2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/orires.h"
67 #include "gromacs/math/functions.h"
68 #include "gromacs/math/units.h"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/math/vecdump.h"
71 #include "gromacs/mdlib/calcmu.h"
72 #include "gromacs/mdlib/constr.h"
73 #include "gromacs/mdlib/force.h"
74 #include "gromacs/mdlib/forcerec.h"
75 #include "gromacs/mdlib/genborn.h"
76 #include "gromacs/mdlib/gmx_omp_nthreads.h"
77 #include "gromacs/mdlib/mdrun.h"
78 #include "gromacs/mdlib/nb_verlet.h"
79 #include "gromacs/mdlib/nbnxn_atomdata.h"
80 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
81 #include "gromacs/mdlib/nbnxn_grid.h"
82 #include "gromacs/mdlib/nbnxn_search.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
86 #include "gromacs/mdtypes/commrec.h"
87 #include "gromacs/mdtypes/iforceprovider.h"
88 #include "gromacs/mdtypes/inputrec.h"
89 #include "gromacs/mdtypes/md_enums.h"
90 #include "gromacs/mdtypes/state.h"
91 #include "gromacs/pbcutil/ishift.h"
92 #include "gromacs/pbcutil/mshift.h"
93 #include "gromacs/pbcutil/pbc.h"
94 #include "gromacs/pulling/pull.h"
95 #include "gromacs/pulling/pull_rotation.h"
96 #include "gromacs/timing/cyclecounter.h"
97 #include "gromacs/timing/gpu_timing.h"
98 #include "gromacs/timing/wallcycle.h"
99 #include "gromacs/timing/wallcyclereporting.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/arrayref.h"
102 #include "gromacs/utility/basedefinitions.h"
103 #include "gromacs/utility/cstringutil.h"
104 #include "gromacs/utility/exceptions.h"
105 #include "gromacs/utility/fatalerror.h"
106 #include "gromacs/utility/gmxassert.h"
107 #include "gromacs/utility/gmxmpi.h"
108 #include "gromacs/utility/logger.h"
109 #include "gromacs/utility/pleasecite.h"
110 #include "gromacs/utility/smalloc.h"
111 #include "gromacs/utility/sysinfo.h"
113 #include "nbnxn_gpu.h"
114 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
115 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
117 void print_time(FILE *out,
118 gmx_walltime_accounting_t walltime_accounting,
121 t_commrec gmx_unused *cr)
124 char timebuf[STRLEN];
125 double dt, elapsed_seconds, time_per_step;
134 fprintf(out, "step %s", gmx_step_str(step, buf));
137 if ((step >= ir->nstlist))
139 double seconds_since_epoch = gmx_gettime();
140 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
141 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
142 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
148 finish = (time_t) (seconds_since_epoch + dt);
149 gmx_ctime_r(&finish, timebuf, STRLEN);
150 sprintf(buf, "%s", timebuf);
151 buf[strlen(buf)-1] = '\0';
152 fprintf(out, ", will finish %s", buf);
156 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
161 fprintf(out, " performance: %.1f ns/day ",
162 ir->delta_t/1000*24*60*60/time_per_step);
175 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
178 char time_string[STRLEN];
187 char timebuf[STRLEN];
188 time_t temp_time = (time_t) the_time;
190 gmx_ctime_r(&temp_time, timebuf, STRLEN);
191 for (i = 0; timebuf[i] >= ' '; i++)
193 time_string[i] = timebuf[i];
195 time_string[i] = '\0';
198 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
201 void print_start(FILE *fplog, t_commrec *cr,
202 gmx_walltime_accounting_t walltime_accounting,
207 sprintf(buf, "Started %s", name);
208 print_date_and_time(fplog, cr->nodeid, buf,
209 walltime_accounting_get_start_time_stamp(walltime_accounting));
212 static void sum_forces(rvec f[], const PaddedRVecVector *forceToAdd)
214 /* TODO: remove this - 1 when padding is properly implemented */
215 int end = forceToAdd->size() - 1;
216 const rvec *fAdd = as_rvec_array(forceToAdd->data());
218 // cppcheck-suppress unreadVariable
219 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
220 #pragma omp parallel for num_threads(nt) schedule(static)
221 for (int i = 0; i < end; i++)
223 rvec_inc(f[i], fAdd[i]);
227 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
228 tensor vir_part, t_graph *graph, matrix box,
229 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
233 /* The short-range virial from surrounding boxes */
235 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
236 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
238 /* Calculate partial virial, for local atoms only, based on short range.
239 * Total virial is computed in global_stat, called from do_md
241 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
242 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
244 /* Add position restraint contribution */
245 for (i = 0; i < DIM; i++)
247 vir_part[i][i] += fr->vir_diag_posres[i];
250 /* Add wall contribution */
251 for (i = 0; i < DIM; i++)
253 vir_part[i][ZZ] += fr->vir_wall_z[i];
258 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
262 static void pull_potential_wrapper(t_commrec *cr,
264 matrix box, rvec x[],
268 gmx_enerdata_t *enerd,
271 gmx_wallcycle_t wcycle)
276 /* Calculate the center of mass forces, this requires communication,
277 * which is why pull_potential is called close to other communication.
278 * The virial contribution is calculated directly,
279 * which is why we call pull_potential after calc_virial.
281 wallcycle_start(wcycle, ewcPULLPOT);
282 set_pbc(&pbc, ir->ePBC, box);
284 enerd->term[F_COM_PULL] +=
285 pull_potential(ir->pull_work, mdatoms, &pbc,
286 cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
287 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
288 wallcycle_stop(wcycle, ewcPULLPOT);
291 static void pme_receive_force_ener(t_commrec *cr,
292 gmx_wallcycle_t wcycle,
293 gmx_enerdata_t *enerd,
296 real e_q, e_lj, dvdl_q, dvdl_lj;
297 float cycles_ppdpme, cycles_seppme;
299 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
300 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
302 /* In case of node-splitting, the PP nodes receive the long-range
303 * forces, virial and energy from the PME nodes here.
305 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
308 gmx_pme_receive_f(cr, as_rvec_array(fr->f_novirsum->data()), fr->vir_el_recip, &e_q,
309 fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
311 enerd->term[F_COUL_RECIP] += e_q;
312 enerd->term[F_LJ_RECIP] += e_lj;
313 enerd->dvdl_lin[efptCOUL] += dvdl_q;
314 enerd->dvdl_lin[efptVDW] += dvdl_lj;
318 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
320 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
323 static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
324 gmx_int64_t step, real forceTolerance,
325 const rvec *x, const rvec *f)
327 real force2Tolerance = gmx::square(forceTolerance);
328 std::uintmax_t numNonFinite = 0;
329 for (int i = 0; i < md->homenr; i++)
331 real force2 = norm2(f[i]);
332 bool nonFinite = !std::isfinite(force2);
333 if (force2 >= force2Tolerance || nonFinite)
335 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
337 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
344 if (numNonFinite > 0)
346 /* Note that with MPI this fatal call on one rank might interrupt
347 * the printing on other ranks. But we can only avoid that with
348 * an expensive MPI barrier that we would need at each step.
350 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
354 static void post_process_forces(t_commrec *cr,
356 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
358 matrix box, rvec x[],
363 t_forcerec *fr, gmx_vsite_t *vsite,
370 /* Spread the mesh force on virtual sites to the other particles...
371 * This is parallellized. MPI communication is performed
372 * if the constructing atoms aren't local.
374 wallcycle_start(wcycle, ewcVSITESPREAD);
375 spread_vsite_f(vsite, x, as_rvec_array(fr->f_novirsum->data()), nullptr,
376 (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
378 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
379 wallcycle_stop(wcycle, ewcVSITESPREAD);
381 if (flags & GMX_FORCE_VIRIAL)
383 /* Now add the forces, this is local */
384 sum_forces(f, fr->f_novirsum);
386 if (EEL_FULL(fr->eeltype))
388 /* Add the mesh contribution to the virial */
389 m_add(vir_force, fr->vir_el_recip, vir_force);
391 if (EVDW_PME(fr->vdwtype))
393 /* Add the mesh contribution to the virial */
394 m_add(vir_force, fr->vir_lj_recip, vir_force);
398 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
403 if (fr->print_force >= 0)
405 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
409 static void do_nb_verlet(t_forcerec *fr,
410 interaction_const_t *ic,
411 gmx_enerdata_t *enerd,
412 int flags, int ilocality,
416 gmx_wallcycle_t wcycle)
418 if (!(flags & GMX_FORCE_NONBONDED))
420 /* skip non-bonded calculation */
424 nonbonded_verlet_t *nbv = fr->nbv;
425 nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
427 /* GPU kernel launch overhead is already timed separately */
428 if (fr->cutoff_scheme != ecutsVERLET)
430 gmx_incons("Invalid cut-off scheme passed!");
433 bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
435 if (!bUsingGpuKernels)
437 /* When dynamic pair-list pruning is requested, we need to prune
438 * at nstlistPrune steps.
440 if (nbv->listParams->useDynamicPruning &&
441 (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
443 /* Prune the pair-list beyond fr->ic->rlistPrune using
444 * the current coordinates of the atoms.
446 wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
447 nbnxn_kernel_cpu_prune(nbvg, fr->shift_vec, nbv->listParams->rlistInner);
448 wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
451 wallcycle_sub_start(wcycle, ewcsNONBONDED);
454 switch (nbvg->kernel_type)
456 case nbnxnk4x4_PlainC:
457 case nbnxnk4xN_SIMD_4xN:
458 case nbnxnk4xN_SIMD_2xNN:
459 nbnxn_kernel_cpu(nbvg,
465 enerd->grpp.ener[egCOULSR],
467 enerd->grpp.ener[egBHAMSR] :
468 enerd->grpp.ener[egLJSR]);
471 case nbnxnk8x8x8_GPU:
472 nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
475 case nbnxnk8x8x8_PlainC:
476 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
481 nbvg->nbat->out[0].f,
483 enerd->grpp.ener[egCOULSR],
485 enerd->grpp.ener[egBHAMSR] :
486 enerd->grpp.ener[egLJSR]);
490 GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
493 if (!bUsingGpuKernels)
495 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
498 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
499 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
501 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
503 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
504 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
506 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
510 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
512 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
513 if (flags & GMX_FORCE_ENERGY)
515 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
516 enr_nbnxn_kernel_ljc += 1;
517 enr_nbnxn_kernel_lj += 1;
520 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
521 nbvg->nbl_lists.natpair_ljq);
522 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
523 nbvg->nbl_lists.natpair_lj);
524 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
525 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
526 nbvg->nbl_lists.natpair_q);
528 if (ic->vdw_modifier == eintmodFORCESWITCH)
530 /* We add up the switch cost separately */
531 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
532 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
534 if (ic->vdw_modifier == eintmodPOTSWITCH)
536 /* We add up the switch cost separately */
537 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
538 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
540 if (ic->vdwtype == evdwPME)
542 /* We add up the LJ Ewald cost separately */
543 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
544 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
548 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
555 gmx_enerdata_t *enerd,
558 gmx_wallcycle_t wcycle)
561 nb_kernel_data_t kernel_data;
563 real dvdl_nb[efptNR];
568 /* Add short-range interactions */
569 donb_flags |= GMX_NONBONDED_DO_SR;
571 /* Currently all group scheme kernels always calculate (shift-)forces */
572 if (flags & GMX_FORCE_FORCES)
574 donb_flags |= GMX_NONBONDED_DO_FORCE;
576 if (flags & GMX_FORCE_VIRIAL)
578 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
580 if (flags & GMX_FORCE_ENERGY)
582 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
585 kernel_data.flags = donb_flags;
586 kernel_data.lambda = lambda;
587 kernel_data.dvdl = dvdl_nb;
589 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
590 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
592 /* reset free energy components */
593 for (i = 0; i < efptNR; i++)
598 assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);
600 wallcycle_sub_start(wcycle, ewcsNONBONDED);
601 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
602 for (th = 0; th < nbl_lists->nnbl; th++)
606 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
607 x, f, fr, mdatoms, &kernel_data, nrnb);
609 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
612 if (fepvals->sc_alpha != 0)
614 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
615 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
619 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
620 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
623 /* If we do foreign lambda and we have soft-core interactions
624 * we have to recalculate the (non-linear) energies contributions.
626 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
628 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
629 kernel_data.lambda = lam_i;
630 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
631 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
632 /* Note that we add to kernel_data.dvdl, but ignore the result */
634 for (i = 0; i < enerd->n_lambda; i++)
636 for (j = 0; j < efptNR; j++)
638 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
640 reset_foreign_enerdata(enerd);
641 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
642 for (th = 0; th < nbl_lists->nnbl; th++)
646 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
647 x, f, fr, mdatoms, &kernel_data, nrnb);
649 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
652 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
653 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
657 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
660 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
662 return nbv != nullptr && nbv->bUseGPU;
665 static gmx_inline void clear_rvecs_omp(int n, rvec v[])
667 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
669 /* Note that we would like to avoid this conditional by putting it
670 * into the omp pragma instead, but then we still take the full
671 * omp parallel for overhead (at least with gcc5).
675 for (int i = 0; i < n; i++)
682 #pragma omp parallel for num_threads(nth) schedule(static)
683 for (int i = 0; i < n; i++)
690 /*! \brief This routine checks if the potential energy is finite.
692 * Note that passing this check does not guarantee finite forces,
693 * since those use slightly different arithmetics. But in most cases
694 * there is just a narrow coordinate range where forces are not finite
695 * and energies are finite.
697 * \param[in] enerd The energy data; the non-bonded group energies need to be added in here before calling this routine
699 static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
701 if (!std::isfinite(enerd->term[F_EPOT]))
703 gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
706 enerd->term[F_COUL_SR]);
710 void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
711 t_inputrec *inputrec,
712 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
714 gmx_groups_t gmx_unused *groups,
715 matrix box, rvec x[], history_t *hist,
716 PaddedRVecVector *force,
719 gmx_enerdata_t *enerd, t_fcdata *fcd,
720 real *lambda, t_graph *graph,
721 t_forcerec *fr, interaction_const_t *ic,
722 gmx_vsite_t *vsite, rvec mu_tot,
723 double t, gmx_edsam_t ed,
726 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
727 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
731 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
732 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
733 gmx_bool bDiffKernels = FALSE;
734 rvec vzero, box_diag;
735 float cycles_pme, cycles_wait_gpu;
736 nonbonded_verlet_t *nbv = fr->nbv;
738 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
739 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
740 bFillGrid = (bNS && bStateChanged);
741 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
742 bDoForces = (flags & GMX_FORCE_FORCES);
743 bUseGPU = fr->nbv->bUseGPU;
744 bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu;
746 /* At a search step we need to start the first balancing region
747 * somewhere early inside the step after communication during domain
748 * decomposition (and not during the previous step as usual).
751 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
753 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
759 const int homenr = mdatoms->homenr;
761 clear_mat(vir_force);
763 if (DOMAINDECOMP(cr))
765 cg1 = cr->dd->ncg_tot;
778 update_forcerec(fr, box);
780 if (inputrecNeedMutot(inputrec))
782 /* Calculate total (local) dipole moment in a temporary common array.
783 * This makes it possible to sum them over nodes faster.
785 calc_mu(start, homenr,
786 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
791 if (fr->ePBC != epbcNONE)
793 /* Compute shift vectors every step,
794 * because of pressure coupling or box deformation!
796 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
798 calc_shifts(box, fr->shift_vec);
803 put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
804 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
806 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
808 unshift_self(graph, box, x);
812 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
813 fr->shift_vec, nbv->grp[0].nbat);
816 if (!(cr->duty & DUTY_PME))
821 /* Send particle coordinates to the pme nodes.
822 * Since this is only implemented for domain decomposition
823 * and domain decomposition does not use the graph,
824 * we do not need to worry about shifting.
827 wallcycle_start(wcycle, ewcPP_PMESENDX);
829 bBS = (inputrec->nwall == 2);
833 svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
836 gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
837 lambda[efptCOUL], lambda[efptVDW],
838 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
841 wallcycle_stop(wcycle, ewcPP_PMESENDX);
845 /* do gridding for pair search */
848 if (graph && bStateChanged)
850 /* Calculate intramolecular shift vectors to make molecules whole */
851 mk_mshift(fplog, graph, fr->ePBC, box, x);
855 box_diag[XX] = box[XX][XX];
856 box_diag[YY] = box[YY][YY];
857 box_diag[ZZ] = box[ZZ][ZZ];
859 wallcycle_start(wcycle, ewcNS);
862 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
863 nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
865 0, mdatoms->homenr, -1, fr->cginfo, x,
867 nbv->grp[eintLocal].kernel_type,
868 nbv->grp[eintLocal].nbat);
869 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
873 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
874 nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
876 nbv->grp[eintNonlocal].kernel_type,
877 nbv->grp[eintNonlocal].nbat);
878 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
881 if (nbv->ngrp == 1 ||
882 nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
884 nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
885 nbv->nbs, mdatoms, fr->cginfo);
889 nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
890 nbv->nbs, mdatoms, fr->cginfo);
891 nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
892 nbv->nbs, mdatoms, fr->cginfo);
894 wallcycle_stop(wcycle, ewcNS);
897 /* initialize the GPU atom data and copy shift vector */
902 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
903 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
904 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
907 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
908 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
909 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
912 /* do local pair search */
915 wallcycle_start_nocount(wcycle, ewcNS);
916 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
917 nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
919 nbv->listParams->rlistOuter,
920 nbv->min_ci_balanced,
921 &nbv->grp[eintLocal].nbl_lists,
923 nbv->grp[eintLocal].kernel_type,
925 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
926 if (nbv->listParams->useDynamicPruning && !bUseGPU)
928 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
930 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
934 /* initialize local pair-list on the GPU */
935 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
936 nbv->grp[eintLocal].nbl_lists.nbl[0],
939 wallcycle_stop(wcycle, ewcNS);
943 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
944 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
945 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
946 nbv->grp[eintLocal].nbat);
947 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
948 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
953 if (DOMAINDECOMP(cr))
955 ddOpenBalanceRegionGpu(cr->dd);
958 wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
959 /* launch local nonbonded F on GPU */
960 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
962 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
965 /* Communicate coordinates and sum dipole if necessary +
966 do non-local pair search */
967 if (DOMAINDECOMP(cr))
969 bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
970 nbv->grp[eintLocal].kernel_type);
974 /* With GPU+CPU non-bonded calculations we need to copy
975 * the local coordinates to the non-local nbat struct
976 * (in CPU format) as the non-local kernel call also
977 * calculates the local - non-local interactions.
979 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
980 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
981 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
982 nbv->grp[eintNonlocal].nbat);
983 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
984 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
989 wallcycle_start_nocount(wcycle, ewcNS);
990 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
994 nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
997 nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
999 nbv->listParams->rlistOuter,
1000 nbv->min_ci_balanced,
1001 &nbv->grp[eintNonlocal].nbl_lists,
1003 nbv->grp[eintNonlocal].kernel_type,
1005 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1006 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1008 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1010 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1012 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1014 /* initialize non-local pair-list on the GPU */
1015 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1016 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1019 wallcycle_stop(wcycle, ewcNS);
1023 wallcycle_start(wcycle, ewcMOVEX);
1024 dd_move_x(cr->dd, box, x);
1025 wallcycle_stop(wcycle, ewcMOVEX);
1027 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1028 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1029 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
1030 nbv->grp[eintNonlocal].nbat);
1031 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1032 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1035 if (bUseGPU && !bDiffKernels)
1037 wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
1038 /* launch non-local nonbonded F on GPU */
1039 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1040 step, nrnb, wcycle);
1041 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1047 /* launch D2H copy-back F */
1048 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1049 if (DOMAINDECOMP(cr) && !bDiffKernels)
1051 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
1052 flags, eatNonlocal);
1054 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
1056 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1059 if (bStateChanged && inputrecNeedMutot(inputrec))
1063 gmx_sumd(2*DIM, mu, cr);
1064 ddReopenBalanceRegionCpu(cr->dd);
1067 for (i = 0; i < 2; i++)
1069 for (j = 0; j < DIM; j++)
1071 fr->mu_tot[i][j] = mu[i*DIM + j];
1075 if (fr->efep == efepNO)
1077 copy_rvec(fr->mu_tot[0], mu_tot);
1081 for (j = 0; j < DIM; j++)
1084 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1085 lambda[efptCOUL]*fr->mu_tot[1][j];
1089 /* Reset energies */
1090 reset_enerdata(enerd);
1091 clear_rvecs(SHIFTS, fr->fshift);
1093 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1095 wallcycle_start(wcycle, ewcPPDURINGPME);
1096 dd_force_flop_start(cr->dd, nrnb);
1101 wallcycle_start(wcycle, ewcROT);
1102 do_rotation(cr, inputrec, box, x, t, step, bNS);
1103 wallcycle_stop(wcycle, ewcROT);
1106 /* Temporary solution until all routines take PaddedRVecVector */
1107 rvec *f = as_rvec_array(force->data());
1109 /* Start the force cycle counter.
1110 * Note that a different counter is used for dynamic load balancing.
1112 wallcycle_start(wcycle, ewcFORCE);
1115 /* Reset forces for which the virial is calculated separately:
1116 * PME/Ewald forces if necessary */
1117 if (fr->bF_NoVirSum)
1119 if (flags & GMX_FORCE_VIRIAL)
1121 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1125 /* We are not calculating the pressure so we do not need
1126 * a separate array for forces that do not contribute
1129 fr->f_novirsum = force;
1133 if (fr->bF_NoVirSum)
1135 if (flags & GMX_FORCE_VIRIAL)
1137 /* TODO: remove this - 1 when padding is properly implemented */
1138 clear_rvecs_omp(fr->f_novirsum->size() - 1,
1139 as_rvec_array(fr->f_novirsum->data()));
1142 /* Clear the short- and long-range forces */
1143 clear_rvecs_omp(fr->natoms_force_constr, f);
1145 clear_rvec(fr->vir_diag_posres);
1148 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1150 clear_pull_forces(inputrec->pull_work);
1153 /* We calculate the non-bonded forces, when done on the CPU, here.
1154 * We do this before calling do_force_lowlevel, because in that
1155 * function, the listed forces are calculated before PME, which
1156 * does communication. With this order, non-bonded and listed
1157 * force calculation imbalance can be balanced out by the domain
1158 * decomposition load balancing.
1163 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1164 step, nrnb, wcycle);
1167 if (fr->efep != efepNO)
1169 /* Calculate the local and non-local free energy interactions here.
1170 * Happens here on the CPU both with and without GPU.
1172 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1174 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1176 inputrec->fepvals, lambda,
1177 enerd, flags, nrnb, wcycle);
1180 if (DOMAINDECOMP(cr) &&
1181 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1183 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1185 inputrec->fepvals, lambda,
1186 enerd, flags, nrnb, wcycle);
1190 if (!bUseOrEmulGPU || bDiffKernels)
1194 if (DOMAINDECOMP(cr))
1196 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
1197 bDiffKernels ? enbvClearFYes : enbvClearFNo,
1198 step, nrnb, wcycle);
1207 aloc = eintNonlocal;
1210 /* Add all the non-bonded force to the normal force array.
1211 * This can be split into a local and a non-local part when overlapping
1212 * communication with calculation with domain decomposition.
1214 wallcycle_stop(wcycle, ewcFORCE);
1215 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1216 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1217 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
1218 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1219 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1220 wallcycle_start_nocount(wcycle, ewcFORCE);
1222 /* if there are multiple fshift output buffers reduce them */
1223 if ((flags & GMX_FORCE_VIRIAL) &&
1224 nbv->grp[aloc].nbl_lists.nnbl > 1)
1226 /* This is not in a subcounter because it takes a
1227 negligible and constant-sized amount of time */
1228 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
1233 /* update QMMMrec, if necessary */
1236 update_QMMMrec(cr, fr, x, mdatoms, box, top);
1239 /* Compute the bonded and non-bonded energies and optionally forces */
1240 do_force_lowlevel(fr, inputrec, &(top->idef),
1241 cr, nrnb, wcycle, mdatoms,
1242 x, hist, f, enerd, fcd, top, fr->born,
1244 inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1245 flags, &cycles_pme);
1247 wallcycle_stop(wcycle, ewcFORCE);
1251 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1254 if (bUseOrEmulGPU && !bDiffKernels)
1256 /* wait for non-local forces (or calculate in emulation mode) */
1257 if (DOMAINDECOMP(cr))
1261 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1262 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1264 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1266 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1270 wallcycle_start_nocount(wcycle, ewcFORCE);
1271 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1272 step, nrnb, wcycle);
1273 wallcycle_stop(wcycle, ewcFORCE);
1275 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1276 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1277 /* skip the reduction if there was no non-local work to do */
1278 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1280 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1281 nbv->grp[eintNonlocal].nbat, f);
1283 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1284 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1288 if (DOMAINDECOMP(cr))
1290 /* We are done with the CPU compute.
1291 * We will now communicate the non-local forces.
1292 * If we use a GPU this will overlap with GPU work, so in that case
1293 * we do not close the DD force balancing region here.
1295 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1297 ddCloseBalanceRegionCpu(cr->dd);
1301 wallcycle_start(wcycle, ewcMOVEF);
1302 dd_move_f(cr->dd, f, fr->fshift);
1303 wallcycle_stop(wcycle, ewcMOVEF);
1309 /* wait for local forces (or calculate in emulation mode) */
1312 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1313 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1314 * but even with a step of 0.1 ms the difference is less than 1%
1317 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1319 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1320 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1322 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1324 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1326 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1328 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1329 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1331 /* We measured few cycles, it could be that the kernel
1332 * and transfer finished earlier and there was no actual
1333 * wait time, only API call overhead.
1334 * Then the actual time could be anywhere between 0 and
1335 * cycles_wait_est. We will use half of cycles_wait_est.
1337 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1339 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1342 /* now clear the GPU outputs while we finish the step on the CPU */
1343 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1344 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1346 /* Is dynamic pair-list pruning activated? */
1347 if (nbv->listParams->useDynamicPruning)
1349 /* We should not launch the rolling pruning kernel at a search
1350 * step or just before search steps, since that's useless.
1351 * Without domain decomposition we prune at even steps.
1352 * With domain decomposition we alternate local and non-local
1353 * pruning at even and odd steps.
1355 int numRollingParts = nbv->listParams->numRollingParts;
1356 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1357 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1358 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1359 if (stepWithCurrentList > 0 &&
1360 stepWithCurrentList < inputrec->nstlist - 1 &&
1361 (stepIsEven || DOMAINDECOMP(cr)))
1363 nbnxn_gpu_launch_kernel_pruneonly(fr->nbv->gpu_nbv,
1364 stepIsEven ? eintLocal : eintNonlocal,
1368 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1372 wallcycle_start_nocount(wcycle, ewcFORCE);
1373 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1374 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1375 step, nrnb, wcycle);
1376 wallcycle_stop(wcycle, ewcFORCE);
1378 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1379 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1380 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1381 nbv->grp[eintLocal].nbat, f);
1382 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1383 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1386 if (DOMAINDECOMP(cr))
1388 dd_force_flop_stop(cr->dd, nrnb);
1393 /* Collect forces from modules */
1394 gmx::ArrayRef<gmx::RVec> fNoVirSum;
1395 if (fr->bF_NoVirSum)
1397 fNoVirSum = *fr->f_novirsum;
1399 fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
1401 /* If we have NoVirSum forces, but we do not calculate the virial,
1402 * we sum fr->f_novirsum=f later.
1404 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1406 wallcycle_start(wcycle, ewcVSITESPREAD);
1407 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1408 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1409 wallcycle_stop(wcycle, ewcVSITESPREAD);
1412 if (flags & GMX_FORCE_VIRIAL)
1414 /* Calculation of the virial must be done after vsites! */
1415 calc_virial(0, mdatoms->homenr, x, f,
1416 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1420 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1422 /* Since the COM pulling is always done mass-weighted, no forces are
1423 * applied to vsites and this call can be done after vsite spreading.
1425 pull_potential_wrapper(cr, inputrec, box, x,
1426 f, vir_force, mdatoms, enerd, lambda, t,
1430 /* Add the forces from enforced rotation potentials (if any) */
1433 wallcycle_start(wcycle, ewcROTadd);
1434 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1435 wallcycle_stop(wcycle, ewcROTadd);
1438 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1439 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1441 if (PAR(cr) && !(cr->duty & DUTY_PME))
1443 /* In case of node-splitting, the PP nodes receive the long-range
1444 * forces, virial and energy from the PME nodes here.
1446 pme_receive_force_ener(cr, wcycle, enerd, fr);
1451 post_process_forces(cr, step, nrnb, wcycle,
1452 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1456 if (flags & GMX_FORCE_ENERGY)
1458 /* Sum the potential energy terms from group contributions */
1459 sum_epot(&(enerd->grpp), enerd->term);
1461 if (!EI_TPI(inputrec->eI))
1463 checkPotentialEnergyValidity(enerd);
1468 void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1469 t_inputrec *inputrec,
1470 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1471 gmx_localtop_t *top,
1472 gmx_groups_t *groups,
1473 matrix box, rvec x[], history_t *hist,
1474 PaddedRVecVector *force,
1477 gmx_enerdata_t *enerd, t_fcdata *fcd,
1478 real *lambda, t_graph *graph,
1479 t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1480 double t, gmx_edsam_t ed,
1481 gmx_bool bBornRadii,
1483 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1484 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1488 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1492 const int start = 0;
1493 const int homenr = mdatoms->homenr;
1495 clear_mat(vir_force);
1498 if (DOMAINDECOMP(cr))
1500 cg1 = cr->dd->ncg_tot;
1511 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1512 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1513 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1514 bFillGrid = (bNS && bStateChanged);
1515 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1516 bDoForces = (flags & GMX_FORCE_FORCES);
1520 update_forcerec(fr, box);
1522 if (inputrecNeedMutot(inputrec))
1524 /* Calculate total (local) dipole moment in a temporary common array.
1525 * This makes it possible to sum them over nodes faster.
1527 calc_mu(start, homenr,
1528 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1533 if (fr->ePBC != epbcNONE)
1535 /* Compute shift vectors every step,
1536 * because of pressure coupling or box deformation!
1538 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1540 calc_shifts(box, fr->shift_vec);
1545 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1546 &(top->cgs), x, fr->cg_cm);
1547 inc_nrnb(nrnb, eNR_CGCM, homenr);
1548 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1550 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1552 unshift_self(graph, box, x);
1557 calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1558 inc_nrnb(nrnb, eNR_CGCM, homenr);
1561 if (bCalcCGCM && gmx_debug_at)
1563 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1567 if (!(cr->duty & DUTY_PME))
1572 /* Send particle coordinates to the pme nodes.
1573 * Since this is only implemented for domain decomposition
1574 * and domain decomposition does not use the graph,
1575 * we do not need to worry about shifting.
1578 wallcycle_start(wcycle, ewcPP_PMESENDX);
1580 bBS = (inputrec->nwall == 2);
1583 copy_mat(box, boxs);
1584 svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1587 gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
1588 lambda[efptCOUL], lambda[efptVDW],
1589 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1592 wallcycle_stop(wcycle, ewcPP_PMESENDX);
1594 #endif /* GMX_MPI */
1596 /* Communicate coordinates and sum dipole if necessary */
1597 if (DOMAINDECOMP(cr))
1599 wallcycle_start(wcycle, ewcMOVEX);
1600 dd_move_x(cr->dd, box, x);
1601 wallcycle_stop(wcycle, ewcMOVEX);
1602 /* No GPU support, no move_x overlap, so reopen the balance region here */
1603 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1605 ddReopenBalanceRegionCpu(cr->dd);
1609 if (inputrecNeedMutot(inputrec))
1615 gmx_sumd(2*DIM, mu, cr);
1616 ddReopenBalanceRegionCpu(cr->dd);
1618 for (i = 0; i < 2; i++)
1620 for (j = 0; j < DIM; j++)
1622 fr->mu_tot[i][j] = mu[i*DIM + j];
1626 if (fr->efep == efepNO)
1628 copy_rvec(fr->mu_tot[0], mu_tot);
1632 for (j = 0; j < DIM; j++)
1635 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1640 /* Reset energies */
1641 reset_enerdata(enerd);
1642 clear_rvecs(SHIFTS, fr->fshift);
1646 wallcycle_start(wcycle, ewcNS);
1648 if (graph && bStateChanged)
1650 /* Calculate intramolecular shift vectors to make molecules whole */
1651 mk_mshift(fplog, graph, fr->ePBC, box, x);
1654 /* Do the actual neighbour searching */
1656 groups, top, mdatoms,
1657 cr, nrnb, bFillGrid);
1659 wallcycle_stop(wcycle, ewcNS);
1662 if (inputrec->implicit_solvent && bNS)
1664 make_gb_nblist(cr, inputrec->gb_algorithm,
1665 x, box, fr, &top->idef, graph, fr->born);
1668 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1670 wallcycle_start(wcycle, ewcPPDURINGPME);
1671 dd_force_flop_start(cr->dd, nrnb);
1676 wallcycle_start(wcycle, ewcROT);
1677 do_rotation(cr, inputrec, box, x, t, step, bNS);
1678 wallcycle_stop(wcycle, ewcROT);
1681 /* Temporary solution until all routines take PaddedRVecVector */
1682 rvec *f = as_rvec_array(force->data());
1684 /* Start the force cycle counter.
1685 * Note that a different counter is used for dynamic load balancing.
1687 wallcycle_start(wcycle, ewcFORCE);
1691 /* Reset forces for which the virial is calculated separately:
1692 * PME/Ewald forces if necessary */
1693 if (fr->bF_NoVirSum)
1695 if (flags & GMX_FORCE_VIRIAL)
1697 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1698 /* TODO: remove this - 1 when padding is properly implemented */
1699 clear_rvecs(fr->f_novirsum->size() - 1,
1700 as_rvec_array(fr->f_novirsum->data()));
1704 /* We are not calculating the pressure so we do not need
1705 * a separate array for forces that do not contribute
1708 fr->f_novirsum = force;
1712 /* Clear the short- and long-range forces */
1713 clear_rvecs(fr->natoms_force_constr, f);
1715 clear_rvec(fr->vir_diag_posres);
1717 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1719 clear_pull_forces(inputrec->pull_work);
1722 /* update QMMMrec, if necessary */
1725 update_QMMMrec(cr, fr, x, mdatoms, box, top);
1728 /* Compute the bonded and non-bonded energies and optionally forces */
1729 do_force_lowlevel(fr, inputrec, &(top->idef),
1730 cr, nrnb, wcycle, mdatoms,
1731 x, hist, f, enerd, fcd, top, fr->born,
1733 inputrec->fepvals, lambda,
1734 graph, &(top->excls), fr->mu_tot,
1738 wallcycle_stop(wcycle, ewcFORCE);
1740 if (DOMAINDECOMP(cr))
1742 dd_force_flop_stop(cr->dd, nrnb);
1744 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1746 ddCloseBalanceRegionCpu(cr->dd);
1752 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1757 /* Collect forces from modules */
1758 gmx::ArrayRef<gmx::RVec> fNoVirSum;
1759 if (fr->bF_NoVirSum)
1761 fNoVirSum = *fr->f_novirsum;
1763 fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
1765 /* Communicate the forces */
1766 if (DOMAINDECOMP(cr))
1768 wallcycle_start(wcycle, ewcMOVEF);
1769 dd_move_f(cr->dd, f, fr->fshift);
1770 /* Do we need to communicate the separate force array
1771 * for terms that do not contribute to the single sum virial?
1772 * Position restraints and electric fields do not introduce
1773 * inter-cg forces, only full electrostatics methods do.
1774 * When we do not calculate the virial, fr->f_novirsum = f,
1775 * so we have already communicated these forces.
1777 if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1778 (flags & GMX_FORCE_VIRIAL))
1780 dd_move_f(cr->dd, as_rvec_array(fr->f_novirsum->data()), nullptr);
1782 wallcycle_stop(wcycle, ewcMOVEF);
1785 /* If we have NoVirSum forces, but we do not calculate the virial,
1786 * we sum fr->f_novirsum=f later.
1788 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1790 wallcycle_start(wcycle, ewcVSITESPREAD);
1791 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1792 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1793 wallcycle_stop(wcycle, ewcVSITESPREAD);
1796 if (flags & GMX_FORCE_VIRIAL)
1798 /* Calculation of the virial must be done after vsites! */
1799 calc_virial(0, mdatoms->homenr, x, f,
1800 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1804 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1806 pull_potential_wrapper(cr, inputrec, box, x,
1807 f, vir_force, mdatoms, enerd, lambda, t,
1811 /* Add the forces from enforced rotation potentials (if any) */
1814 wallcycle_start(wcycle, ewcROTadd);
1815 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1816 wallcycle_stop(wcycle, ewcROTadd);
1819 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1820 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1822 if (PAR(cr) && !(cr->duty & DUTY_PME))
1824 /* In case of node-splitting, the PP nodes receive the long-range
1825 * forces, virial and energy from the PME nodes here.
1827 pme_receive_force_ener(cr, wcycle, enerd, fr);
1832 post_process_forces(cr, step, nrnb, wcycle,
1833 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1837 if (flags & GMX_FORCE_ENERGY)
1839 /* Sum the potential energy terms from group contributions */
1840 sum_epot(&(enerd->grpp), enerd->term);
1842 if (!EI_TPI(inputrec->eI))
1844 checkPotentialEnergyValidity(enerd);
1850 void do_force(FILE *fplog, t_commrec *cr,
1851 t_inputrec *inputrec,
1852 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1853 gmx_localtop_t *top,
1854 gmx_groups_t *groups,
1855 matrix box, PaddedRVecVector *coordinates, history_t *hist,
1856 PaddedRVecVector *force,
1859 gmx_enerdata_t *enerd, t_fcdata *fcd,
1860 gmx::ArrayRef<real> lambda, t_graph *graph,
1862 gmx_vsite_t *vsite, rvec mu_tot,
1863 double t, gmx_edsam_t ed,
1864 gmx_bool bBornRadii,
1866 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1867 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1869 /* modify force flag if not doing nonbonded */
1870 if (!fr->bNonbonded)
1872 flags &= ~GMX_FORCE_NONBONDED;
1875 GMX_ASSERT(coordinates->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1876 fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1877 GMX_ASSERT(force->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1878 fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1880 rvec *x = as_rvec_array(coordinates->data());
1882 switch (inputrec->cutoff_scheme)
1885 do_force_cutsVERLET(fplog, cr, inputrec,
1893 lambda.data(), graph,
1899 ddOpenBalanceRegion,
1900 ddCloseBalanceRegion);
1903 do_force_cutsGROUP(fplog, cr, inputrec,
1911 lambda.data(), graph,
1916 ddOpenBalanceRegion,
1917 ddCloseBalanceRegion);
1920 gmx_incons("Invalid cut-off scheme passed!");
1923 /* In case we don't have constraints and are using GPUs, the next balancing
1924 * region starts here.
1925 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1926 * virial calculation and COM pulling, is not thus not included in
1927 * the balance timing, which is ok as most tasks do communication.
1929 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1931 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
1936 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1937 t_inputrec *ir, t_mdatoms *md,
1938 t_state *state, t_commrec *cr, t_nrnb *nrnb,
1939 t_forcerec *fr, gmx_localtop_t *top)
1941 int i, m, start, end;
1943 real dt = ir->delta_t;
1947 /* We need to allocate one element extra, since we might use
1948 * (unaligned) 4-wide SIMD loads to access rvec entries.
1950 snew(savex, state->natoms + 1);
1957 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1958 start, md->homenr, end);
1960 /* Do a first constrain to reset particles... */
1961 step = ir->init_step;
1964 char buf[STEPSTRSIZE];
1965 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1966 gmx_step_str(step, buf));
1970 /* constrain the current position */
1971 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1972 ir, cr, step, 0, 1.0, md,
1973 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
1974 fr->bMolPBC, state->box,
1975 state->lambda[efptBONDED], &dvdl_dum,
1976 nullptr, nullptr, nrnb, econqCoord);
1979 /* constrain the inital velocity, and save it */
1980 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
1981 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1982 ir, cr, step, 0, 1.0, md,
1983 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
1984 fr->bMolPBC, state->box,
1985 state->lambda[efptBONDED], &dvdl_dum,
1986 nullptr, nullptr, nrnb, econqVeloc);
1988 /* constrain the inital velocities at t-dt/2 */
1989 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1991 for (i = start; (i < end); i++)
1993 for (m = 0; (m < DIM); m++)
1995 /* Reverse the velocity */
1996 state->v[i][m] = -state->v[i][m];
1997 /* Store the position at t-dt in buf */
1998 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
2001 /* Shake the positions at t=-dt with the positions at t=0
2002 * as reference coordinates.
2006 char buf[STEPSTRSIZE];
2007 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2008 gmx_step_str(step, buf));
2011 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2012 ir, cr, step, -1, 1.0, md,
2013 as_rvec_array(state->x.data()), savex, nullptr,
2014 fr->bMolPBC, state->box,
2015 state->lambda[efptBONDED], &dvdl_dum,
2016 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2018 for (i = start; i < end; i++)
2020 for (m = 0; m < DIM; m++)
2022 /* Re-reverse the velocities */
2023 state->v[i][m] = -state->v[i][m];
2032 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2033 double *enerout, double *virout)
2035 double enersum, virsum;
2036 double invscale, invscale2, invscale3;
2037 double r, ea, eb, ec, pa, pb, pc, pd;
2042 invscale = 1.0/scale;
2043 invscale2 = invscale*invscale;
2044 invscale3 = invscale*invscale2;
2046 /* Following summation derived from cubic spline definition,
2047 * Numerical Recipies in C, second edition, p. 113-116. Exact for
2048 * the cubic spline. We first calculate the negative of the
2049 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2050 * add the more standard, abrupt cutoff correction to that result,
2051 * yielding the long-range correction for a switched function. We
2052 * perform both the pressure and energy loops at the same time for
2053 * simplicity, as the computational cost is low. */
2057 /* Since the dispersion table has been scaled down a factor
2058 * 6.0 and the repulsion a factor 12.0 to compensate for the
2059 * c6/c12 parameters inside nbfp[] being scaled up (to save
2060 * flops in kernels), we need to correct for this.
2071 for (ri = rstart; ri < rend; ++ri)
2075 eb = 2.0*invscale2*r;
2079 pb = 3.0*invscale2*r;
2080 pc = 3.0*invscale*r*r;
2083 /* this "8" is from the packing in the vdwtab array - perhaps
2084 should be defined? */
2086 offset = 8*ri + offstart;
2087 y0 = vdwtab[offset];
2088 f = vdwtab[offset+1];
2089 g = vdwtab[offset+2];
2090 h = vdwtab[offset+3];
2092 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2093 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
2095 *enerout = 4.0*M_PI*enersum*tabfactor;
2096 *virout = 4.0*M_PI*virsum*tabfactor;
2099 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2101 double eners[2], virs[2], enersum, virsum;
2102 double r0, rc3, rc9;
2104 real scale, *vdwtab;
2106 fr->enershiftsix = 0;
2107 fr->enershifttwelve = 0;
2108 fr->enerdiffsix = 0;
2109 fr->enerdifftwelve = 0;
2111 fr->virdifftwelve = 0;
2113 if (eDispCorr != edispcNO)
2115 for (i = 0; i < 2; i++)
2120 if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
2121 (fr->vdw_modifier == eintmodPOTSWITCH) ||
2122 (fr->vdw_modifier == eintmodFORCESWITCH) ||
2123 (fr->vdwtype == evdwSHIFT) ||
2124 (fr->vdwtype == evdwSWITCH))
2126 if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
2127 (fr->vdw_modifier == eintmodFORCESWITCH) ||
2128 (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
2131 "With dispersion correction rvdw-switch can not be zero "
2132 "for vdw-type = %s", evdw_names[fr->vdwtype]);
2135 /* TODO This code depends on the logic in tables.c that
2136 constructs the table layout, which should be made
2137 explicit in future cleanup. */
2138 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2139 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2140 scale = fr->dispersionCorrectionTable->scale;
2141 vdwtab = fr->dispersionCorrectionTable->data;
2143 /* Round the cut-offs to exact table values for precision */
2144 ri0 = static_cast<int>(floor(fr->rvdw_switch*scale));
2145 ri1 = static_cast<int>(ceil(fr->rvdw*scale));
2147 /* The code below has some support for handling force-switching, i.e.
2148 * when the force (instead of potential) is switched over a limited
2149 * region. This leads to a constant shift in the potential inside the
2150 * switching region, which we can handle by adding a constant energy
2151 * term in the force-switch case just like when we do potential-shift.
2153 * For now this is not enabled, but to keep the functionality in the
2154 * code we check separately for switch and shift. When we do force-switch
2155 * the shifting point is rvdw_switch, while it is the cutoff when we
2156 * have a classical potential-shift.
2158 * For a pure potential-shift the potential has a constant shift
2159 * all the way out to the cutoff, and that is it. For other forms
2160 * we need to calculate the constant shift up to the point where we
2161 * start modifying the potential.
2163 ri0 = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2169 if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
2170 (fr->vdwtype == evdwSHIFT))
2172 /* Determine the constant energy shift below rvdw_switch.
2173 * Table has a scale factor since we have scaled it down to compensate
2174 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2176 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2177 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2179 else if (fr->vdw_modifier == eintmodPOTSHIFT)
2181 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2182 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2185 /* Add the constant part from 0 to rvdw_switch.
2186 * This integration from 0 to rvdw_switch overcounts the number
2187 * of interactions by 1, as it also counts the self interaction.
2188 * We will correct for this later.
2190 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2191 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2193 /* Calculate the contribution in the range [r0,r1] where we
2194 * modify the potential. For a pure potential-shift modifier we will
2195 * have ri0==ri1, and there will not be any contribution here.
2197 for (i = 0; i < 2; i++)
2201 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2202 eners[i] -= enersum;
2206 /* Alright: Above we compensated by REMOVING the parts outside r0
2207 * corresponding to the ideal VdW 1/r6 and /r12 potentials.
2209 * Regardless of whether r0 is the point where we start switching,
2210 * or the cutoff where we calculated the constant shift, we include
2211 * all the parts we are missing out to infinity from r0 by
2212 * calculating the analytical dispersion correction.
2214 eners[0] += -4.0*M_PI/(3.0*rc3);
2215 eners[1] += 4.0*M_PI/(9.0*rc9);
2216 virs[0] += 8.0*M_PI/rc3;
2217 virs[1] += -16.0*M_PI/(3.0*rc9);
2219 else if (fr->vdwtype == evdwCUT ||
2220 EVDW_PME(fr->vdwtype) ||
2221 fr->vdwtype == evdwUSER)
2223 if (fr->vdwtype == evdwUSER && fplog)
2226 "WARNING: using dispersion correction with user tables\n");
2229 /* Note that with LJ-PME, the dispersion correction is multiplied
2230 * by the difference between the actual C6 and the value of C6
2231 * that would produce the combination rule.
2232 * This means the normal energy and virial difference formulas
2236 rc3 = fr->rvdw*fr->rvdw*fr->rvdw;
2238 /* Contribution beyond the cut-off */
2239 eners[0] += -4.0*M_PI/(3.0*rc3);
2240 eners[1] += 4.0*M_PI/(9.0*rc9);
2241 if (fr->vdw_modifier == eintmodPOTSHIFT)
2243 /* Contribution within the cut-off */
2244 eners[0] += -4.0*M_PI/(3.0*rc3);
2245 eners[1] += 4.0*M_PI/(3.0*rc9);
2247 /* Contribution beyond the cut-off */
2248 virs[0] += 8.0*M_PI/rc3;
2249 virs[1] += -16.0*M_PI/(3.0*rc9);
2254 "Dispersion correction is not implemented for vdw-type = %s",
2255 evdw_names[fr->vdwtype]);
2258 /* When we deprecate the group kernels the code below can go too */
2259 if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2261 /* Calculate self-interaction coefficient (assuming that
2262 * the reciprocal-space contribution is constant in the
2263 * region that contributes to the self-interaction).
2265 fr->enershiftsix = gmx::power6(fr->ewaldcoeff_lj) / 6.0;
2267 eners[0] += -gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj)/3.0;
2268 virs[0] += gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj);
2271 fr->enerdiffsix = eners[0];
2272 fr->enerdifftwelve = eners[1];
2273 /* The 0.5 is due to the Gromacs definition of the virial */
2274 fr->virdiffsix = 0.5*virs[0];
2275 fr->virdifftwelve = 0.5*virs[1];
2279 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2280 matrix box, real lambda, tensor pres, tensor virial,
2281 real *prescorr, real *enercorr, real *dvdlcorr)
2283 gmx_bool bCorrAll, bCorrPres;
2284 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2294 if (ir->eDispCorr != edispcNO)
2296 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2297 ir->eDispCorr == edispcAllEnerPres);
2298 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2299 ir->eDispCorr == edispcAllEnerPres);
2301 invvol = 1/det(box);
2304 /* Only correct for the interactions with the inserted molecule */
2305 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2310 dens = fr->numAtomsForDispersionCorrection*invvol;
2311 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2314 if (ir->efep == efepNO)
2316 avcsix = fr->avcsix[0];
2317 avctwelve = fr->avctwelve[0];
2321 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2322 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2325 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2326 *enercorr += avcsix*enerdiff;
2328 if (ir->efep != efepNO)
2330 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2334 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2335 *enercorr += avctwelve*enerdiff;
2336 if (fr->efep != efepNO)
2338 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2344 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2345 if (ir->eDispCorr == edispcAllEnerPres)
2347 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2349 /* The factor 2 is because of the Gromacs virial definition */
2350 spres = -2.0*invvol*svir*PRESFAC;
2352 for (m = 0; m < DIM; m++)
2354 virial[m][m] += svir;
2355 pres[m][m] += spres;
2360 /* Can't currently control when it prints, for now, just print when degugging */
2365 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2371 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2372 *enercorr, spres, svir);
2376 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2380 if (fr->efep != efepNO)
2382 *dvdlcorr += dvdlambda;
2387 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2388 const gmx_mtop_t *mtop, rvec x[],
2393 gmx_molblock_t *molb;
2395 if (bFirst && fplog)
2397 fprintf(fplog, "Removing pbc first time\n");
2402 for (mb = 0; mb < mtop->nmolblock; mb++)
2404 molb = &mtop->molblock[mb];
2405 if (molb->natoms_mol == 1 ||
2406 (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2408 /* Just one atom or charge group in the molecule, no PBC required */
2409 as += molb->nmol*molb->natoms_mol;
2413 /* Pass NULL iso fplog to avoid graph prints for each molecule type */
2414 mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2415 0, molb->natoms_mol, FALSE, FALSE, graph);
2417 for (mol = 0; mol < molb->nmol; mol++)
2419 mk_mshift(fplog, graph, ePBC, box, x+as);
2421 shift_self(graph, box, x+as);
2422 /* The molecule is whole now.
2423 * We don't need the second mk_mshift call as in do_pbc_first,
2424 * since we no longer need this graph.
2427 as += molb->natoms_mol;
2435 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2436 const gmx_mtop_t *mtop, rvec x[])
2438 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
2441 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2442 gmx_mtop_t *mtop, rvec x[])
2444 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
2447 void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
2450 nth = gmx_omp_nthreads_get(emntDefault);
2452 #pragma omp parallel for num_threads(nth) schedule(static)
2453 for (t = 0; t < nth; t++)
2459 offset = (natoms*t )/nth;
2460 len = (natoms*(t + 1))/nth - offset;
2461 put_atoms_in_box(ePBC, box, len, x + offset);
2463 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2467 // TODO This can be cleaned up a lot, and move back to runner.cpp
2468 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2469 const t_inputrec *inputrec,
2470 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2471 gmx_walltime_accounting_t walltime_accounting,
2472 nonbonded_verlet_t *nbv,
2473 gmx_bool bWriteStat)
2475 t_nrnb *nrnb_tot = nullptr;
2477 double nbfs = 0, mflop = 0;
2478 double elapsed_time,
2479 elapsed_time_over_all_ranks,
2480 elapsed_time_over_all_threads,
2481 elapsed_time_over_all_threads_over_all_ranks;
2482 /* Control whether it is valid to print a report. Only the
2483 simulation master may print, but it should not do so if the run
2484 terminated e.g. before a scheduled reset step. This is
2485 complicated by the fact that PME ranks are unaware of the
2486 reason why they were sent a pmerecvqxFINISH. To avoid
2487 communication deadlocks, we always do the communication for the
2488 report, even if we've decided not to write the report, because
2489 how long it takes to finish the run is not important when we've
2490 decided not to report on the simulation performance. */
2491 bool printReport = SIMMASTER(cr);
2493 if (!walltime_accounting_get_valid_finish(walltime_accounting))
2495 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2496 printReport = false;
2503 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2504 cr->mpi_comm_mysim);
2512 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2513 elapsed_time_over_all_ranks = elapsed_time;
2514 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2515 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2519 /* reduce elapsed_time over all MPI ranks in the current simulation */
2520 MPI_Allreduce(&elapsed_time,
2521 &elapsed_time_over_all_ranks,
2522 1, MPI_DOUBLE, MPI_SUM,
2523 cr->mpi_comm_mysim);
2524 elapsed_time_over_all_ranks /= cr->nnodes;
2525 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2526 * current simulation. */
2527 MPI_Allreduce(&elapsed_time_over_all_threads,
2528 &elapsed_time_over_all_threads_over_all_ranks,
2529 1, MPI_DOUBLE, MPI_SUM,
2530 cr->mpi_comm_mysim);
2536 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2543 if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2545 print_dd_statistics(cr, inputrec, fplog);
2548 /* TODO Move the responsibility for any scaling by thread counts
2549 * to the code that handled the thread region, so that there's a
2550 * mechanism to keep cycle counting working during the transition
2551 * to task parallelism. */
2552 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2553 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2554 wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
2555 auto cycle_sum(wallcycle_sum(cr, wcycle));
2559 struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2561 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2562 elapsed_time_over_all_ranks,
2563 wcycle, cycle_sum, gputimes);
2565 if (EI_DYNAMICS(inputrec->eI))
2567 delta_t = inputrec->delta_t;
2572 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2573 elapsed_time_over_all_ranks,
2574 walltime_accounting_get_nsteps_done(walltime_accounting),
2575 delta_t, nbfs, mflop);
2579 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2580 elapsed_time_over_all_ranks,
2581 walltime_accounting_get_nsteps_done(walltime_accounting),
2582 delta_t, nbfs, mflop);
2587 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2589 /* this function works, but could probably use a logic rewrite to keep all the different
2590 types of efep straight. */
2592 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2597 t_lambda *fep = ir->fepvals;
2598 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2599 if checkpoint is set -- a kludge is in for now
2602 for (int i = 0; i < efptNR; i++)
2604 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2605 if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
2607 lambda[i] = fep->init_lambda;
2610 lam0[i] = lambda[i];
2615 lambda[i] = fep->all_lambda[i][*fep_state];
2618 lam0[i] = lambda[i];
2624 /* need to rescale control temperatures to match current state */
2625 for (int i = 0; i < ir->opts.ngtc; i++)
2627 if (ir->opts.ref_t[i] > 0)
2629 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2634 /* Send to the log the information on the current lambdas */
2635 if (fplog != nullptr)
2637 fprintf(fplog, "Initial vector of lambda components:[ ");
2638 for (const auto &l : lambda)
2640 fprintf(fplog, "%10.4f ", l);
2642 fprintf(fplog, "]\n");
2648 void init_md(FILE *fplog,
2649 t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2650 t_inputrec *ir, const gmx_output_env_t *oenv,
2651 double *t, double *t0,
2652 gmx::ArrayRef<real> lambda, int *fep_state, double *lam0,
2653 t_nrnb *nrnb, gmx_mtop_t *mtop,
2655 int nfile, const t_filenm fnm[],
2656 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2657 tensor force_vir, tensor shake_vir, rvec mu_tot,
2658 gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
2659 gmx_wallcycle_t wcycle)
2663 /* Initial values */
2664 *t = *t0 = ir->init_t;
2667 for (i = 0; i < ir->opts.ngtc; i++)
2669 /* set bSimAnn if any group is being annealed */
2670 if (ir->opts.annealing[i] != eannNO)
2676 /* Initialize lambda variables */
2677 initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2679 // TODO upd is never NULL in practice, but the analysers don't know that
2682 *upd = init_update(ir);
2686 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
2691 *vcm = init_vcm(fplog, &mtop->groups, ir);
2694 if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2696 if (ir->etc == etcBERENDSEN)
2698 please_cite(fplog, "Berendsen84a");
2700 if (ir->etc == etcVRESCALE)
2702 please_cite(fplog, "Bussi2007a");
2704 if (ir->eI == eiSD1)
2706 please_cite(fplog, "Goga2012");
2713 *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, outputProvider, ir, mtop, oenv, wcycle);
2715 *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? nullptr : mdoutf_get_fp_ene(*outf),
2716 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2719 /* Initiate variables */
2720 clear_mat(force_vir);
2721 clear_mat(shake_vir);