2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/orires.h"
67 #include "gromacs/math/functions.h"
68 #include "gromacs/math/units.h"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/math/vecdump.h"
71 #include "gromacs/mdlib/calcmu.h"
72 #include "gromacs/mdlib/constr.h"
73 #include "gromacs/mdlib/force.h"
74 #include "gromacs/mdlib/forcerec.h"
75 #include "gromacs/mdlib/genborn.h"
76 #include "gromacs/mdlib/gmx_omp_nthreads.h"
77 #include "gromacs/mdlib/mdrun.h"
78 #include "gromacs/mdlib/nb_verlet.h"
79 #include "gromacs/mdlib/nbnxn_atomdata.h"
80 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
81 #include "gromacs/mdlib/nbnxn_grid.h"
82 #include "gromacs/mdlib/nbnxn_search.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
86 #include "gromacs/mdtypes/commrec.h"
87 #include "gromacs/mdtypes/iforceprovider.h"
88 #include "gromacs/mdtypes/inputrec.h"
89 #include "gromacs/mdtypes/md_enums.h"
90 #include "gromacs/mdtypes/state.h"
91 #include "gromacs/pbcutil/ishift.h"
92 #include "gromacs/pbcutil/mshift.h"
93 #include "gromacs/pbcutil/pbc.h"
94 #include "gromacs/pulling/pull.h"
95 #include "gromacs/pulling/pull_rotation.h"
96 #include "gromacs/timing/cyclecounter.h"
97 #include "gromacs/timing/gpu_timing.h"
98 #include "gromacs/timing/wallcycle.h"
99 #include "gromacs/timing/wallcyclereporting.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/arrayref.h"
102 #include "gromacs/utility/basedefinitions.h"
103 #include "gromacs/utility/cstringutil.h"
104 #include "gromacs/utility/exceptions.h"
105 #include "gromacs/utility/fatalerror.h"
106 #include "gromacs/utility/gmxassert.h"
107 #include "gromacs/utility/gmxmpi.h"
108 #include "gromacs/utility/logger.h"
109 #include "gromacs/utility/pleasecite.h"
110 #include "gromacs/utility/smalloc.h"
111 #include "gromacs/utility/sysinfo.h"
113 #include "nbnxn_gpu.h"
114 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
// Print per-step progress to 'out': the step number, an estimate of the
// finish time and remaining wall-clock seconds, and a ns/day performance
// figure derived from the measured time per step.
// NOTE(review): this view of the function is missing extracted lines
// (declarations of step/ir/buf/finish and braces) — do not edit blind.
116 void print_time(FILE *out,
117                 gmx_walltime_accounting_t walltime_accounting,
120                 t_commrec gmx_unused *cr)
123     char timebuf[STRLEN];
124     double dt, elapsed_seconds, time_per_step;
133     fprintf(out, "step %s", gmx_step_str(step, buf));
// Only estimate timings once at least one neighbor-search interval has run,
// so the per-step average is meaningful.
136     if ((step >= ir->nstlist))
138         double seconds_since_epoch = gmx_gettime();
// Elapsed time is measured against the run's recorded start time stamp.
139         elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
140         time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
// dt = projected seconds remaining until the last step (ir->nsteps) completes.
141         dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
147             finish = (time_t) (seconds_since_epoch + dt);
148             gmx_ctime_r(&finish, timebuf, STRLEN);
149             sprintf(buf, "%s", timebuf);
// Strip the trailing newline that ctime-style formatting appends.
150             buf[strlen(buf)-1] = '\0';
151             fprintf(out, ", will finish %s", buf);
155             fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
// ns/day = (ps per step / 1000 ps-per-ns) * seconds-per-day / seconds-per-step.
160         fprintf(out, "                         performance: %.1f ns/day    ",
161                 ir->delta_t/1000*24*60*60/time_per_step);
// Write "<title> on rank <nodeid> <formatted time>" to the log file.
// The ctime-style string is copied only up to the first control character,
// which drops the trailing newline.
// NOTE(review): extraction gaps — the time parameter declaration and braces
// are missing from this view.
174 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
177     char time_string[STRLEN];
186         char timebuf[STRLEN];
187         time_t temp_time = (time_t) the_time;
189         gmx_ctime_r(&temp_time, timebuf, STRLEN);
// Copy printable characters only; stops at '\n' (or any char below ' ').
190         for (i = 0; timebuf[i] >= ' '; i++)
192             time_string[i] = timebuf[i];
194         time_string[i] = '\0';
197     fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
// Log "Started <name>" together with the run's start time stamp, tagged with
// this rank's id (cr->nodeid), via print_date_and_time().
200 void print_start(FILE *fplog, t_commrec *cr,
201                  gmx_walltime_accounting_t walltime_accounting,
206     sprintf(buf, "Started %s", name);
207     print_date_and_time(fplog, cr->nodeid, buf,
208                         walltime_accounting_get_start_time_stamp(walltime_accounting));
// Element-wise add the forces in *forceToAdd into f, parallelized over the
// default OpenMP thread pool.
211 static void sum_forces(rvec f[], const PaddedRVecVector *forceToAdd)
// The vector is padded by one element; iterate only over the real entries.
213     /* TODO: remove this - 1 when padding is properly implemented */
214     int end = forceToAdd->size() - 1;
215     const rvec *fAdd = as_rvec_array(forceToAdd->data());
// 'nt' is only read inside the pragma, hence gmx_unused + cppcheck suppression.
217     // cppcheck-suppress unreadVariable
218     int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
219 #pragma omp parallel for num_threads(nt) schedule(static)
220     for (int i = 0; i < end; i++)
222         rvec_inc(f[i], fAdd[i]);
// Accumulate the short-range virial into vir_part: the shift-vector
// contribution, the per-atom f·x contribution for local atoms, plus the
// diagonal position-restraint and wall terms stored in the forcerec.
226 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
227                         tensor vir_part, t_graph *graph, matrix box,
228                         t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
232     /* The short-range virial from surrounding boxes */
234     calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
235     inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
237     /* Calculate partial virial, for local atoms only, based on short range.
238      * Total virial is computed in global_stat, called from do_md
240     f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
241     inc_nrnb(nrnb, eNR_VIRIAL, homenr);
243     /* Add position restraint contribution */
244     for (i = 0; i < DIM; i++)
246         vir_part[i][i] += fr->vir_diag_posres[i];
// Walls act along z, so their contribution goes into the ZZ column only.
249     /* Add wall contribution */
250     for (i = 0; i < DIM; i++)
252         vir_part[i][ZZ] += fr->vir_wall_z[i];
257         pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
// Wrapper around pull_potential(): sets up PBC, accumulates the COM pulling
// energy into enerd->term[F_COM_PULL] and the restraint dH/dlambda into
// enerd->dvdl_lin, all inside the ewcPULLPOT cycle counter.
// NOTE(review): some parameter lines (mdatoms, f, vir_force, t, dvdl, pbc
// declaration) are missing from this extracted view.
261 static void pull_potential_wrapper(t_commrec *cr,
263                                    matrix box, rvec x[],
267                                    gmx_enerdata_t *enerd,
270                                    gmx_wallcycle_t wcycle)
275     /* Calculate the center of mass forces, this requires communication,
276      * which is why pull_potential is called close to other communication.
277      * The virial contribution is calculated directly,
278      * which is why we call pull_potential after calc_virial.
280     wallcycle_start(wcycle, ewcPULLPOT);
281     set_pbc(&pbc, ir->ePBC, box);
283     enerd->term[F_COM_PULL] +=
284         pull_potential(ir->pull_work, mdatoms, &pbc,
285                        cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
286     enerd->dvdl_lin[efptRESTRAINT] += dvdl;
287     wallcycle_stop(wcycle, ewcPULLPOT);
// On PP ranks in a PP/PME-split run: receive long-range forces, virials,
// energies and dH/dlambda terms from the PME ranks, add them into enerd,
// and record the wait/receive time in the cycle counters and DLB accounting.
290 static void pme_receive_force_ener(t_commrec *cr,
291                                    gmx_wallcycle_t wcycle,
292                                    gmx_enerdata_t *enerd,
295     real   e_q, e_lj, dvdl_q, dvdl_lj;
296     float  cycles_ppdpme, cycles_seppme;
298     cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
299     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
301     /* In case of node-splitting, the PP nodes receive the long-range
302      * forces, virial and energy from the PME nodes here.
304     wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
// Received forces are accumulated into the no-virial-summation force buffer.
307     gmx_pme_receive_f(cr, as_rvec_array(fr->f_novirsum->data()), fr->vir_el_recip, &e_q,
308                       fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
310     enerd->term[F_COUL_RECIP] += e_q;
311     enerd->term[F_LJ_RECIP]   += e_lj;
312     enerd->dvdl_lin[efptCOUL] += dvdl_q;
313     enerd->dvdl_lin[efptVDW]  += dvdl_lj;
317         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
319     wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
// Scan the forces of all home atoms and print any atom whose force norm
// exceeds forceTolerance or is non-finite; abort the run with gmx_fatal if
// any non-finite forces were found.
322 static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
323                                gmx_int64_t step, real forceTolerance,
324                                const rvec *x, const rvec *f)
// Compare squared norms to avoid a sqrt per atom in the common case.
326     real           force2Tolerance = gmx::square(forceTolerance);
327     std::uintmax_t numNonFinite    = 0;
328     for (int i = 0; i < md->homenr; i++)
330         real force2    = norm2(f[i]);
331         bool nonFinite = !std::isfinite(force2);
332         if (force2 >= force2Tolerance || nonFinite)
// ddglatnr maps the local atom index to the global atom number for output.
334             fprintf(fp, "step %" GMX_PRId64 " atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
336                     ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
343     if (numNonFinite > 0)
345         /* Note that with MPI this fatal call on one rank might interrupt
346          * the printing on other ranks. But we can only avoid that with
347          * an expensive MPI barrier that we would need at each step.
349         gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
// Post-processing after the force calculation: spread mesh (PME) forces from
// virtual sites back onto real atoms, fold the no-virial-sum force buffer and
// the mesh virials into the main force/virial arrays when the virial is
// requested, and optionally report unusually large forces.
353 static void post_process_forces(t_commrec *cr,
355                                 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
357                                 matrix box, rvec x[],
362                                 t_forcerec *fr, gmx_vsite_t *vsite,
369             /* Spread the mesh force on virtual sites to the other particles...
370              * This is parallellized. MPI communication is performed
371              * if the constructing atoms aren't local.
373             wallcycle_start(wcycle, ewcVSITESPREAD);
374             spread_vsite_f(vsite, x, as_rvec_array(fr->f_novirsum->data()), nullptr,
375                            (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
377                            &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
378             wallcycle_stop(wcycle, ewcVSITESPREAD);
380         if (flags & GMX_FORCE_VIRIAL)
382             /* Now add the forces, this is local */
383             sum_forces(f, fr->f_novirsum);
// The reciprocal-space (mesh) virials were computed separately; add them in.
385             if (EEL_FULL(fr->eeltype))
387                 /* Add the mesh contribution to the virial */
388                 m_add(vir_force, fr->vir_el_recip, vir_force);
390             if (EVDW_PME(fr->vdwtype))
392                 /* Add the mesh contribution to the virial */
393                 m_add(vir_force, fr->vir_lj_recip, vir_force);
397                 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
// print_force < 0 disables the large-force check (see mdp option).
402     if (fr->print_force >= 0)
404         print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
// Launch the Verlet-scheme nonbonded kernel for one interaction locality
// (local or non-local): dispatches to the CPU SIMD kernels, the GPU launch,
// or the plain-C GPU-reference kernel, handles optional dynamic pair-list
// pruning on the CPU path, and books the flop counts into nrnb.
408 static void do_nb_verlet(t_forcerec *fr,
409                          interaction_const_t *ic,
410                          gmx_enerdata_t *enerd,
411                          int flags, int ilocality,
415                          gmx_wallcycle_t wcycle)
// Nothing to do when the nonbonded part is not requested this step.
417     if (!(flags & GMX_FORCE_NONBONDED))
419         /* skip non-bonded calculation */
423     nonbonded_verlet_t       *nbv  = fr->nbv;
424     nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
426     /* GPU kernel launch overhead is already timed separately */
427     if (fr->cutoff_scheme != ecutsVERLET)
429         gmx_incons("Invalid cut-off scheme passed!");
432     bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
// CPU-only bookkeeping: sub-cycle counting and dynamic list pruning.
434     if (!bUsingGpuKernels)
436         /* When dynamic pair-list pruning is requested, we need to prune
437          * at nstlistPrune steps.
439         if (nbv->listParams->useDynamicPruning &&
440             (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
442             /* Prune the pair-list beyond fr->ic->rlistPrune using
443              * the current coordinates of the atoms.
445             wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
// Placeholder: the CPU rolling-prune kernel is not hooked up yet here.
446             GMX_RELEASE_ASSERT(false, "The CPU prune kernel will be called here");
447             wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
450         wallcycle_sub_start(wcycle, ewcsNONBONDED);
453     switch (nbvg->kernel_type)
455         case nbnxnk4x4_PlainC:
456         case nbnxnk4xN_SIMD_4xN:
457         case nbnxnk4xN_SIMD_2xNN:
458             nbnxn_kernel_cpu(nbvg,
// Energy-group output buffers; Buckingham and LJ share the vdW slot.
464                              enerd->grpp.ener[egCOULSR],
466                              enerd->grpp.ener[egBHAMSR] :
467                              enerd->grpp.ener[egLJSR]);
470         case nbnxnk8x8x8_GPU:
471             nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
474         case nbnxnk8x8x8_PlainC:
475             nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
480                                  nbvg->nbat->out[0].f,
482                                  enerd->grpp.ener[egCOULSR],
484                                  enerd->grpp.ener[egBHAMSR] :
485                                  enerd->grpp.ener[egLJSR]);
489             GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
492     if (!bUsingGpuKernels)
494         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
// Flop accounting: pick the LJ+Coulomb counter matching the electrostatics.
497     int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
498     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
500         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
502     else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
503              (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
505         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
509         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
511     enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
512     if (flags & GMX_FORCE_ENERGY)
514         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
515         enr_nbnxn_kernel_ljc += 1;
516         enr_nbnxn_kernel_lj  += 1;
519     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
520              nbvg->nbl_lists.natpair_ljq);
521     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
522              nbvg->nbl_lists.natpair_lj);
523     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
524     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
525              nbvg->nbl_lists.natpair_q);
// Modifier/LJ-PME costs are counted on top of the base kernel flops.
527     if (ic->vdw_modifier == eintmodFORCESWITCH)
529         /* We add up the switch cost separately */
530         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
531                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
533     if (ic->vdw_modifier == eintmodPOTSWITCH)
535         /* We add up the switch cost separately */
536         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
537                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
539     if (ic->vdwtype == evdwPME)
541         /* We add up the LJ Ewald cost separately */
542         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
543                  nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
// Run the free-energy perturbation nonbonded kernel over the FEP pair lists
// (one list per OpenMP thread), accumulating dH/dlambda into enerd. When
// soft-core is active and foreign lambdas are requested, re-evaluate the
// energies at each foreign lambda as well (forces are skipped there).
547 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
554                              gmx_enerdata_t *enerd,
557                              gmx_wallcycle_t wcycle)
560     nb_kernel_data_t kernel_data;
562     real             dvdl_nb[efptNR];
567     /* Add short-range interactions */
568     donb_flags |= GMX_NONBONDED_DO_SR;
570     /* Currently all group scheme kernels always calculate (shift-)forces */
571     if (flags & GMX_FORCE_FORCES)
573         donb_flags |= GMX_NONBONDED_DO_FORCE;
575     if (flags & GMX_FORCE_VIRIAL)
577         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
579     if (flags & GMX_FORCE_ENERGY)
581         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
584     kernel_data.flags  = donb_flags;
585     kernel_data.lambda = lambda;
586     kernel_data.dvdl   = dvdl_nb;
588     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
589     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
591     /* reset free energy components */
592     for (i = 0; i < efptNR; i++)
// One FEP pair list per nonbonded OpenMP thread is assumed below.
597     assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);
599     wallcycle_sub_start(wcycle, ewcsNONBONDED);
600 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
601     for (th = 0; th < nbl_lists->nnbl; th++)
605             gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
606                                       x, f, fr, mdatoms, &kernel_data, nrnb);
608         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
// With soft-core, dH/dl is non-linear in lambda; book it accordingly.
611     if (fepvals->sc_alpha != 0)
613         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
614         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
618         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
619         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
622     /* If we do foreign lambda and we have soft-core interactions
623      * we have to recalculate the (non-linear) energies contributions.
625     if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
// Foreign-lambda pass: energies only, no (shift-)forces.
627         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
628         kernel_data.lambda         = lam_i;
629         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
630         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
631         /* Note that we add to kernel_data.dvdl, but ignore the result */
633         for (i = 0; i < enerd->n_lambda; i++)
// i == 0 is the current lambda; i > 0 indexes the foreign lambda table.
635             for (j = 0; j < efptNR; j++)
637                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
639             reset_foreign_enerdata(enerd);
640 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
641             for (th = 0; th < nbl_lists->nnbl; th++)
645                     gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
646                                               x, f, fr, mdatoms, &kernel_data, nrnb);
648                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
651             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
652             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
656     wallcycle_sub_stop(wcycle, ewcsNONBONDED);
659 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
661 return nbv != nullptr && nbv->bUseGPU;
// Zero n rvecs, serially for small n and with an OpenMP parallel-for
// otherwise; the thread count comes from the simple-rvec-task heuristic.
// NOTE(review): the loop bodies (the clear_rvec calls) and the branch
// between the two loops are missing from this extracted view.
664 static gmx_inline void clear_rvecs_omp(int n, rvec v[])
666     int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
668     /* Note that we would like to avoid this conditional by putting it
669      * into the omp pragma instead, but then we still take the full
670      * omp parallel for overhead (at least with gcc5).
// Serial path (used when the heuristic returns a single thread).
674         for (int i = 0; i < n; i++)
// Parallel path.
681 #pragma omp parallel for num_threads(nth) schedule(static)
682         for (int i = 0; i < n; i++)
689 /*! \brief This routine checks if the potential energy is finite.
691  * Note that passing this check does not guarantee finite forces,
692  * since those use slightly different arithmetics. But in most cases
693  * there is just a narrow coordinate range where forces are not finite
694  * and energies are finite.
696  * \param[in] enerd The energy data; the non-bonded group energies need to be added in here before calling this routine
698 static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
// Abort with a diagnostic (total, LJ and Coulomb terms) on NaN/Inf energy.
700     if (!std::isfinite(enerd->term[F_EPOT]))
702         gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
705                   enerd->term[F_COUL_SR]);
709 void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
710 t_inputrec *inputrec,
711 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
713 gmx_groups_t gmx_unused *groups,
714 matrix box, rvec x[], history_t *hist,
715 PaddedRVecVector *force,
718 gmx_enerdata_t *enerd, t_fcdata *fcd,
719 real *lambda, t_graph *graph,
720 t_forcerec *fr, interaction_const_t *ic,
721 gmx_vsite_t *vsite, rvec mu_tot,
722 double t, gmx_edsam_t ed,
725 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
726 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
730 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
731 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
732 gmx_bool bDiffKernels = FALSE;
733 rvec vzero, box_diag;
734 float cycles_pme, cycles_wait_gpu;
735 nonbonded_verlet_t *nbv = fr->nbv;
737 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
738 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
739 bFillGrid = (bNS && bStateChanged);
740 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
741 bDoForces = (flags & GMX_FORCE_FORCES);
742 bUseGPU = fr->nbv->bUseGPU;
743 bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu;
745 /* At a search step we need to start the first balancing region
746 * somewhere early inside the step after communication during domain
747 * decomposition (and not during the previous step as usual).
750 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
752 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
758 const int homenr = mdatoms->homenr;
760 clear_mat(vir_force);
762 if (DOMAINDECOMP(cr))
764 cg1 = cr->dd->ncg_tot;
777 update_forcerec(fr, box);
779 if (inputrecNeedMutot(inputrec))
781 /* Calculate total (local) dipole moment in a temporary common array.
782 * This makes it possible to sum them over nodes faster.
784 calc_mu(start, homenr,
785 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
790 if (fr->ePBC != epbcNONE)
792 /* Compute shift vectors every step,
793 * because of pressure coupling or box deformation!
795 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
797 calc_shifts(box, fr->shift_vec);
802 put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
803 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
805 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
807 unshift_self(graph, box, x);
811 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
812 fr->shift_vec, nbv->grp[0].nbat);
815 if (!(cr->duty & DUTY_PME))
820 /* Send particle coordinates to the pme nodes.
821 * Since this is only implemented for domain decomposition
822 * and domain decomposition does not use the graph,
823 * we do not need to worry about shifting.
826 wallcycle_start(wcycle, ewcPP_PMESENDX);
828 bBS = (inputrec->nwall == 2);
832 svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
835 gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
836 lambda[efptCOUL], lambda[efptVDW],
837 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
840 wallcycle_stop(wcycle, ewcPP_PMESENDX);
844 /* do gridding for pair search */
847 if (graph && bStateChanged)
849 /* Calculate intramolecular shift vectors to make molecules whole */
850 mk_mshift(fplog, graph, fr->ePBC, box, x);
854 box_diag[XX] = box[XX][XX];
855 box_diag[YY] = box[YY][YY];
856 box_diag[ZZ] = box[ZZ][ZZ];
858 wallcycle_start(wcycle, ewcNS);
861 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
862 nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
864 0, mdatoms->homenr, -1, fr->cginfo, x,
866 nbv->grp[eintLocal].kernel_type,
867 nbv->grp[eintLocal].nbat);
868 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
872 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
873 nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
875 nbv->grp[eintNonlocal].kernel_type,
876 nbv->grp[eintNonlocal].nbat);
877 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
880 if (nbv->ngrp == 1 ||
881 nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
883 nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
884 nbv->nbs, mdatoms, fr->cginfo);
888 nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
889 nbv->nbs, mdatoms, fr->cginfo);
890 nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
891 nbv->nbs, mdatoms, fr->cginfo);
893 wallcycle_stop(wcycle, ewcNS);
896 /* initialize the GPU atom data and copy shift vector */
901 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
902 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
903 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
906 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
907 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
908 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
911 /* do local pair search */
914 wallcycle_start_nocount(wcycle, ewcNS);
915 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
916 nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
918 nbv->listParams->rlistOuter,
919 nbv->min_ci_balanced,
920 &nbv->grp[eintLocal].nbl_lists,
922 nbv->grp[eintLocal].kernel_type,
924 nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
925 if (nbv->listParams->useDynamicPruning && !bUseGPU)
927 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
929 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
933 /* initialize local pair-list on the GPU */
934 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
935 nbv->grp[eintLocal].nbl_lists.nbl[0],
938 wallcycle_stop(wcycle, ewcNS);
942 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
943 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
944 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
945 nbv->grp[eintLocal].nbat);
946 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
947 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
952 if (DOMAINDECOMP(cr))
954 ddOpenBalanceRegionGpu(cr->dd);
957 wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
958 /* launch local nonbonded F on GPU */
959 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
961 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
964 /* Communicate coordinates and sum dipole if necessary +
965 do non-local pair search */
966 if (DOMAINDECOMP(cr))
968 bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
969 nbv->grp[eintLocal].kernel_type);
973 /* With GPU+CPU non-bonded calculations we need to copy
974 * the local coordinates to the non-local nbat struct
975 * (in CPU format) as the non-local kernel call also
976 * calculates the local - non-local interactions.
978 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
979 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
980 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
981 nbv->grp[eintNonlocal].nbat);
982 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
983 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
988 wallcycle_start_nocount(wcycle, ewcNS);
989 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
993 nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
996 nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
998 nbv->listParams->rlistOuter,
999 nbv->min_ci_balanced,
1000 &nbv->grp[eintNonlocal].nbl_lists,
1002 nbv->grp[eintNonlocal].kernel_type,
1004 nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
1005 if (nbv->listParams->useDynamicPruning && !bUseGPU)
1007 nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
1009 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
1011 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
1013 /* initialize non-local pair-list on the GPU */
1014 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
1015 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
1018 wallcycle_stop(wcycle, ewcNS);
1022 wallcycle_start(wcycle, ewcMOVEX);
1023 dd_move_x(cr->dd, box, x);
1024 wallcycle_stop(wcycle, ewcMOVEX);
1026 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1027 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1028 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
1029 nbv->grp[eintNonlocal].nbat);
1030 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1031 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1034 if (bUseGPU && !bDiffKernels)
1036 wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
1037 /* launch non-local nonbonded F on GPU */
1038 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1039 step, nrnb, wcycle);
1040 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1046 /* launch D2H copy-back F */
1047 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1048 if (DOMAINDECOMP(cr) && !bDiffKernels)
1050 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
1051 flags, eatNonlocal);
1053 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
1055 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1058 if (bStateChanged && inputrecNeedMutot(inputrec))
1062 gmx_sumd(2*DIM, mu, cr);
1063 ddReopenBalanceRegionCpu(cr->dd);
1066 for (i = 0; i < 2; i++)
1068 for (j = 0; j < DIM; j++)
1070 fr->mu_tot[i][j] = mu[i*DIM + j];
1074 if (fr->efep == efepNO)
1076 copy_rvec(fr->mu_tot[0], mu_tot);
1080 for (j = 0; j < DIM; j++)
1083 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1084 lambda[efptCOUL]*fr->mu_tot[1][j];
1088 /* Reset energies */
1089 reset_enerdata(enerd);
1090 clear_rvecs(SHIFTS, fr->fshift);
1092 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1094 wallcycle_start(wcycle, ewcPPDURINGPME);
1095 dd_force_flop_start(cr->dd, nrnb);
1100 wallcycle_start(wcycle, ewcROT);
1101 do_rotation(cr, inputrec, box, x, t, step, bNS);
1102 wallcycle_stop(wcycle, ewcROT);
1105 /* Temporary solution until all routines take PaddedRVecVector */
1106 rvec *f = as_rvec_array(force->data());
1108 /* Start the force cycle counter.
1109 * Note that a different counter is used for dynamic load balancing.
1111 wallcycle_start(wcycle, ewcFORCE);
1114 /* Reset forces for which the virial is calculated separately:
1115 * PME/Ewald forces if necessary */
1116 if (fr->bF_NoVirSum)
1118 if (flags & GMX_FORCE_VIRIAL)
1120 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1124 /* We are not calculating the pressure so we do not need
1125 * a separate array for forces that do not contribute
1128 fr->f_novirsum = force;
1132 if (fr->bF_NoVirSum)
1134 if (flags & GMX_FORCE_VIRIAL)
1136 /* TODO: remove this - 1 when padding is properly implemented */
1137 clear_rvecs_omp(fr->f_novirsum->size() - 1,
1138 as_rvec_array(fr->f_novirsum->data()));
1141 /* Clear the short- and long-range forces */
1142 clear_rvecs_omp(fr->natoms_force_constr, f);
1144 clear_rvec(fr->vir_diag_posres);
1147 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1149 clear_pull_forces(inputrec->pull_work);
1152 /* We calculate the non-bonded forces, when done on the CPU, here.
1153 * We do this before calling do_force_lowlevel, because in that
1154 * function, the listed forces are calculated before PME, which
1155 * does communication. With this order, non-bonded and listed
1156 * force calculation imbalance can be balanced out by the domain
1157 * decomposition load balancing.
1162 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1163 step, nrnb, wcycle);
1166 if (fr->efep != efepNO)
1168 /* Calculate the local and non-local free energy interactions here.
1169 * Happens here on the CPU both with and without GPU.
1171 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1173 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1175 inputrec->fepvals, lambda,
1176 enerd, flags, nrnb, wcycle);
1179 if (DOMAINDECOMP(cr) &&
1180 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1182 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1184 inputrec->fepvals, lambda,
1185 enerd, flags, nrnb, wcycle);
1189 if (!bUseOrEmulGPU || bDiffKernels)
1193 if (DOMAINDECOMP(cr))
1195 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
1196 bDiffKernels ? enbvClearFYes : enbvClearFNo,
1197 step, nrnb, wcycle);
1206 aloc = eintNonlocal;
1209 /* Add all the non-bonded force to the normal force array.
1210 * This can be split into a local and a non-local part when overlapping
1211 * communication with calculation with domain decomposition.
1213 wallcycle_stop(wcycle, ewcFORCE);
1214 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1215 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1216 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
1217 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1218 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1219 wallcycle_start_nocount(wcycle, ewcFORCE);
1221 /* if there are multiple fshift output buffers reduce them */
1222 if ((flags & GMX_FORCE_VIRIAL) &&
1223 nbv->grp[aloc].nbl_lists.nnbl > 1)
1225 /* This is not in a subcounter because it takes a
1226 negligible and constant-sized amount of time */
1227 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
1232 /* update QMMMrec, if necessary */
1235 update_QMMMrec(cr, fr, x, mdatoms, box, top);
1238 /* Compute the bonded and non-bonded energies and optionally forces */
1239 do_force_lowlevel(fr, inputrec, &(top->idef),
1240 cr, nrnb, wcycle, mdatoms,
1241 x, hist, f, enerd, fcd, top, fr->born,
1243 inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1244 flags, &cycles_pme);
1246 wallcycle_stop(wcycle, ewcFORCE);
1250 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1253 if (bUseOrEmulGPU && !bDiffKernels)
1255 /* wait for non-local forces (or calculate in emulation mode) */
1256 if (DOMAINDECOMP(cr))
1260 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1261 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1263 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1265 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1269 wallcycle_start_nocount(wcycle, ewcFORCE);
1270 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1271 step, nrnb, wcycle);
1272 wallcycle_stop(wcycle, ewcFORCE);
1274 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1275 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1276 /* skip the reduction if there was no non-local work to do */
1277 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1279 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1280 nbv->grp[eintNonlocal].nbat, f);
1282 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1283 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1287 if (DOMAINDECOMP(cr))
1289 /* We are done with the CPU compute.
1290 * We will now communicate the non-local forces.
1291 * If we use a GPU this will overlap with GPU work, so in that case
1292 * we do not close the DD force balancing region here.
1294 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1296 ddCloseBalanceRegionCpu(cr->dd);
1300 wallcycle_start(wcycle, ewcMOVEF);
1301 dd_move_f(cr->dd, f, fr->fshift);
1302 wallcycle_stop(wcycle, ewcMOVEF);
1308 /* wait for local forces (or calculate in emulation mode) */
1311 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1312 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1313 * but even with a step of 0.1 ms the difference is less than 1%
1316 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1318 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1319 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1321 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1323 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1325 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1327 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1328 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1330 /* We measured few cycles, it could be that the kernel
1331 * and transfer finished earlier and there was no actual
1332 * wait time, only API call overhead.
1333 * Then the actual time could be anywhere between 0 and
1334 * cycles_wait_est. We will use half of cycles_wait_est.
1336 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1338 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1341 /* now clear the GPU outputs while we finish the step on the CPU */
1342 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1343 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1345 /* Is dynamic pair-list pruning activated? */
1346 if (nbv->listParams->useDynamicPruning)
1348 /* We should not launch the rolling pruning kernel at a search
1349 * step or just before search steps, since that's useless.
1350 * Without domain decomposition we prune at even steps.
1351 * With domain decomposition we alternate local and non-local
1352 * pruning at even and odd steps.
1354 int numRollingParts = nbv->listParams->numRollingParts;
1355 GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
1356 int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
1357 bool stepIsEven = ((stepWithCurrentList & 1) == 0);
1358 if (stepWithCurrentList > 0 &&
1359 stepWithCurrentList < inputrec->nstlist - 1 &&
1360 (stepIsEven || DOMAINDECOMP(cr)))
1362 GMX_RELEASE_ASSERT(false, "The GPU prune kernel will be called here");
1365 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1369 wallcycle_start_nocount(wcycle, ewcFORCE);
1370 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1371 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1372 step, nrnb, wcycle);
1373 wallcycle_stop(wcycle, ewcFORCE);
1375 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1376 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1377 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1378 nbv->grp[eintLocal].nbat, f);
1379 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1380 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1383 if (DOMAINDECOMP(cr))
1385 dd_force_flop_stop(cr->dd, nrnb);
1390 /* Collect forces from modules */
1391 gmx::ArrayRef<gmx::RVec> fNoVirSum;
1392 if (fr->bF_NoVirSum)
1394 fNoVirSum = *fr->f_novirsum;
1396 fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
1398 /* If we have NoVirSum forces, but we do not calculate the virial,
1399 * we sum fr->f_novirsum=f later.
1401 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1403 wallcycle_start(wcycle, ewcVSITESPREAD);
1404 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1405 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1406 wallcycle_stop(wcycle, ewcVSITESPREAD);
1409 if (flags & GMX_FORCE_VIRIAL)
1411 /* Calculation of the virial must be done after vsites! */
1412 calc_virial(0, mdatoms->homenr, x, f,
1413 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1417 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1419 /* Since the COM pulling is always done mass-weighted, no forces are
1420 * applied to vsites and this call can be done after vsite spreading.
1422 pull_potential_wrapper(cr, inputrec, box, x,
1423 f, vir_force, mdatoms, enerd, lambda, t,
1427 /* Add the forces from enforced rotation potentials (if any) */
1430 wallcycle_start(wcycle, ewcROTadd);
1431 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1432 wallcycle_stop(wcycle, ewcROTadd);
1435 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1436 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1438 if (PAR(cr) && !(cr->duty & DUTY_PME))
1440 /* In case of node-splitting, the PP nodes receive the long-range
1441 * forces, virial and energy from the PME nodes here.
1443 pme_receive_force_ener(cr, wcycle, enerd, fr);
1448 post_process_forces(cr, step, nrnb, wcycle,
1449 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1453 if (flags & GMX_FORCE_ENERGY)
1455 /* Sum the potential energy terms from group contributions */
1456 sum_epot(&(enerd->grpp), enerd->term);
1458 if (!EI_TPI(inputrec->eI))
1460 checkPotentialEnergyValidity(enerd);
/*! \brief Compute energies and forces for one MD step using the
 * (older) group cut-off scheme.
 *
 * High-level flow visible here: optionally put charge groups in the
 * box and compute their centers, send coordinates to separate PME
 * ranks, communicate coordinates with domain decomposition, do
 * neighbor searching when requested, evaluate bonded + non-bonded
 * interactions via do_force_lowlevel(), communicate forces back,
 * spread virtual-site forces, compute the virial, and apply COM
 * pulling / enforced rotation / IMD forces before post-processing.
 *
 * NOTE(review): this listing is an excerpt; intervening lines
 * (braces, some statements) are elided in this view.
 */
1465 void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1466 t_inputrec *inputrec,
1467 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1468 gmx_localtop_t *top,
1469 gmx_groups_t *groups,
1470 matrix box, rvec x[], history_t *hist,
1471 PaddedRVecVector *force,
1474 gmx_enerdata_t *enerd, t_fcdata *fcd,
1475 real *lambda, t_graph *graph,
1476 t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1477 double t, gmx_edsam_t ed,
1478 gmx_bool bBornRadii,
1480 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1481 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1485 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1489 const int start = 0;
1490 const int homenr = mdatoms->homenr;
1492 clear_mat(vir_force);
1495 if (DOMAINDECOMP(cr))
1497 cg1 = cr->dd->ncg_tot;
/* Decode the force flags into local booleans used below */
1508 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1509 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1510 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1511 bFillGrid = (bNS && bStateChanged);
1512 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1513 bDoForces = (flags & GMX_FORCE_FORCES);
1517 update_forcerec(fr, box);
1519 if (inputrecNeedMutot(inputrec))
1521 /* Calculate total (local) dipole moment in a temporary common array.
1522 * This makes it possible to sum them over nodes faster.
1524 calc_mu(start, homenr,
1525 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1530 if (fr->ePBC != epbcNONE)
1532 /* Compute shift vectors every step,
1533 * because of pressure coupling or box deformation!
1535 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1537 calc_shifts(box, fr->shift_vec);
1542 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1543 &(top->cgs), x, fr->cg_cm)
1544 inc_nrnb(nrnb, eNR_CGCM, homenr);
1545 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1547 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1549 unshift_self(graph, box, x);
1554 calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1555 inc_nrnb(nrnb, eNR_CGCM, homenr);
1558 if (bCalcCGCM && gmx_debug_at)
1560 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
1564 if (!(cr->duty & DUTY_PME))
1569 /* Send particle coordinates to the pme nodes.
1570 * Since this is only implemented for domain decomposition
1571 * and domain decomposition does not use the graph,
1572 * we do not need to worry about shifting.
1575 wallcycle_start(wcycle, ewcPP_PMESENDX);
/* With two walls, Ewald uses a z-scaled box for the reciprocal part */
1577 bBS = (inputrec->nwall == 2);
1580 copy_mat(box, boxs);
1581 svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1584 gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
1585 lambda[efptCOUL], lambda[efptVDW],
1586 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1589 wallcycle_stop(wcycle, ewcPP_PMESENDX);
1591 #endif /* GMX_MPI */
1593 /* Communicate coordinates and sum dipole if necessary */
1594 if (DOMAINDECOMP(cr))
1596 wallcycle_start(wcycle, ewcMOVEX);
1597 dd_move_x(cr->dd, box, x);
1598 wallcycle_stop(wcycle, ewcMOVEX);
1599 /* No GPU support, no move_x overlap, so reopen the balance region here */
1600 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1602 ddReopenBalanceRegionCpu(cr->dd);
1606 if (inputrecNeedMutot(inputrec))
/* Sum the A- and B-state dipoles (2*DIM doubles) over all ranks */
1612 gmx_sumd(2*DIM, mu, cr);
1613 ddReopenBalanceRegionCpu(cr->dd);
1615 for (i = 0; i < 2; i++)
1617 for (j = 0; j < DIM; j++)
1619 fr->mu_tot[i][j] = mu[i*DIM + j];
1623 if (fr->efep == efepNO)
1625 copy_rvec(fr->mu_tot[0], mu_tot);
/* Free-energy case: linearly interpolate the dipole in lambda */
1629 for (j = 0; j < DIM; j++)
1632 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1637 /* Reset energies */
1638 reset_enerdata(enerd);
1639 clear_rvecs(SHIFTS, fr->fshift);
1643 wallcycle_start(wcycle, ewcNS);
1645 if (graph && bStateChanged)
1647 /* Calculate intramolecular shift vectors to make molecules whole */
1648 mk_mshift(fplog, graph, fr->ePBC, box, x);
1651 /* Do the actual neighbour searching */
1653 groups, top, mdatoms,
1654 cr, nrnb, bFillGrid);
1656 wallcycle_stop(wcycle, ewcNS);
1659 if (inputrec->implicit_solvent && bNS)
1661 make_gb_nblist(cr, inputrec->gb_algorithm,
1662 x, box, fr, &top->idef, graph, fr->born);
1665 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1667 wallcycle_start(wcycle, ewcPPDURINGPME);
1668 dd_force_flop_start(cr->dd, nrnb);
1673 wallcycle_start(wcycle, ewcROT);
1674 do_rotation(cr, inputrec, box, x, t, step, bNS);
1675 wallcycle_stop(wcycle, ewcROT);
1678 /* Temporary solution until all routines take PaddedRVecVector */
1679 rvec *f = as_rvec_array(force->data());
1681 /* Start the force cycle counter.
1682 * Note that a different counter is used for dynamic load balancing.
1684 wallcycle_start(wcycle, ewcFORCE);
1688 /* Reset forces for which the virial is calculated separately:
1689 * PME/Ewald forces if necessary */
1690 if (fr->bF_NoVirSum)
1692 if (flags & GMX_FORCE_VIRIAL)
1694 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1695 /* TODO: remove this - 1 when padding is properly implemented */
1696 clear_rvecs(fr->f_novirsum->size() - 1,
1697 as_rvec_array(fr->f_novirsum->data()));
1701 /* We are not calculating the pressure so we do not need
1702 * a separate array for forces that do not contribute
1705 fr->f_novirsum = force;
1709 /* Clear the short- and long-range forces */
1710 clear_rvecs(fr->natoms_force_constr, f);
1712 clear_rvec(fr->vir_diag_posres);
1714 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1716 clear_pull_forces(inputrec->pull_work);
1719 /* update QMMMrec, if necessary */
1722 update_QMMMrec(cr, fr, x, mdatoms, box, top);
1725 /* Compute the bonded and non-bonded energies and optionally forces */
1726 do_force_lowlevel(fr, inputrec, &(top->idef),
1727 cr, nrnb, wcycle, mdatoms,
1728 x, hist, f, enerd, fcd, top, fr->born,
1730 inputrec->fepvals, lambda,
1731 graph, &(top->excls), fr->mu_tot,
1735 wallcycle_stop(wcycle, ewcFORCE);
1737 if (DOMAINDECOMP(cr))
1739 dd_force_flop_stop(cr->dd, nrnb);
1741 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1743 ddCloseBalanceRegionCpu(cr->dd);
/* Essential-dynamics flooding forces (no-op when ed is inactive) */
1749 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1754 /* Collect forces from modules */
1755 gmx::ArrayRef<gmx::RVec> fNoVirSum;
1756 if (fr->bF_NoVirSum)
1758 fNoVirSum = *fr->f_novirsum;
1760 fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
1762 /* Communicate the forces */
1763 if (DOMAINDECOMP(cr))
1765 wallcycle_start(wcycle, ewcMOVEF);
1766 dd_move_f(cr->dd, f, fr->fshift);
1767 /* Do we need to communicate the separate force array
1768 * for terms that do not contribute to the single sum virial?
1769 * Position restraints and electric fields do not introduce
1770 * inter-cg forces, only full electrostatics methods do.
1771 * When we do not calculate the virial, fr->f_novirsum = f,
1772 * so we have already communicated these forces.
1774 if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1775 (flags & GMX_FORCE_VIRIAL))
1777 dd_move_f(cr->dd, as_rvec_array(fr->f_novirsum->data()), nullptr);
1779 wallcycle_stop(wcycle, ewcMOVEF);
1782 /* If we have NoVirSum forces, but we do not calculate the virial,
1783 * we sum fr->f_novirsum=f later.
1785 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1787 wallcycle_start(wcycle, ewcVSITESPREAD);
1788 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1789 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1790 wallcycle_stop(wcycle, ewcVSITESPREAD);
1793 if (flags & GMX_FORCE_VIRIAL)
1795 /* Calculation of the virial must be done after vsites! */
1796 calc_virial(0, mdatoms->homenr, x, f,
1797 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
/* COM pulling is mass-weighted, so it is safe after vsite spreading */
1801 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1803 pull_potential_wrapper(cr, inputrec, box, x,
1804 f, vir_force, mdatoms, enerd, lambda, t,
1808 /* Add the forces from enforced rotation potentials (if any) */
1811 wallcycle_start(wcycle, ewcROTadd);
1812 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1813 wallcycle_stop(wcycle, ewcROTadd);
1816 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1817 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1819 if (PAR(cr) && !(cr->duty & DUTY_PME))
1821 /* In case of node-splitting, the PP nodes receive the long-range
1822 * forces, virial and energy from the PME nodes here.
1824 pme_receive_force_ener(cr, wcycle, enerd, fr);
1829 post_process_forces(cr, step, nrnb, wcycle,
1830 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1834 if (flags & GMX_FORCE_ENERGY)
1836 /* Sum the potential energy terms from group contributions */
1837 sum_epot(&(enerd->grpp), enerd->term);
1839 if (!EI_TPI(inputrec->eI))
1841 checkPotentialEnergyValidity(enerd);
/*! \brief Top-level force computation entry point.
 *
 * Clears the non-bonded force flag when non-bondeds are disabled,
 * checks that the coordinate and force buffers carry the one extra
 * element needed for (unaligned) SIMD access, and then dispatches to
 * do_force_cutsVERLET() or do_force_cutsGROUP() based on
 * inputrec->cutoff_scheme. Afterwards it (re)opens the DD force
 * balancing region when requested.
 */
1847 void do_force(FILE *fplog, t_commrec *cr,
1848 t_inputrec *inputrec,
1849 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1850 gmx_localtop_t *top,
1851 gmx_groups_t *groups,
1852 matrix box, PaddedRVecVector *coordinates, history_t *hist,
1853 PaddedRVecVector *force,
1856 gmx_enerdata_t *enerd, t_fcdata *fcd,
1857 gmx::ArrayRef<real> lambda, t_graph *graph,
1859 gmx_vsite_t *vsite, rvec mu_tot,
1860 double t, gmx_edsam_t ed,
1861 gmx_bool bBornRadii,
1863 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1864 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1866 /* modify force flag if not doing nonbonded */
1867 if (!fr->bNonbonded)
1869 flags &= ~GMX_FORCE_NONBONDED;
/* Padded buffers: one extra rvec may be read by 4-wide SIMD loads */
1872 GMX_ASSERT(coordinates->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1873 fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1874 GMX_ASSERT(force->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1875 fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1877 rvec *x = as_rvec_array(coordinates->data());
1879 switch (inputrec->cutoff_scheme)
1882 do_force_cutsVERLET(fplog, cr, inputrec,
1890 lambda.data(), graph,
1896 ddOpenBalanceRegion,
1897 ddCloseBalanceRegion);
1900 do_force_cutsGROUP(fplog, cr, inputrec,
1908 lambda.data(), graph,
1913 ddOpenBalanceRegion,
1914 ddCloseBalanceRegion);
1917 gmx_incons("Invalid cut-off scheme passed!");
1920 /* In case we don't have constraints and are using GPUs, the next balancing
1921 * region starts here.
1922 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1923 * virial calculation and COM pulling, is thus not included in
1924 * the balance timing, which is ok as most tasks do communication.
1926 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1928 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
/*! \brief Apply constraints to the starting configuration.
 *
 * Constrains the initial coordinates and the initial velocities.
 * For leap-frog-style integrators (state-velocity integrators other
 * than md-vv) it additionally constrains positions at t = -dt by
 * temporarily reversing the velocities, back-propagating positions
 * into a scratch buffer, constraining those, and reversing the
 * velocities again.
 */
1933 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1934 t_inputrec *ir, t_mdatoms *md,
1935 t_state *state, t_commrec *cr, t_nrnb *nrnb,
1936 t_forcerec *fr, gmx_localtop_t *top)
1938 int i, m, start, end;
1940 real dt = ir->delta_t;
1944 /* We need to allocate one element extra, since we might use
1945 * (unaligned) 4-wide SIMD loads to access rvec entries.
1947 snew(savex, state->natoms + 1);
1954 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1955 start, md->homenr, end);
1957 /* Do a first constrain to reset particles... */
1958 step = ir->init_step;
1961 char buf[STEPSTRSIZE];
1962 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1963 gmx_step_str(step, buf));
1967 /* constrain the current position */
1968 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1969 ir, cr, step, 0, 1.0, md,
1970 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
1971 fr->bMolPBC, state->box,
1972 state->lambda[efptBONDED], &dvdl_dum,
1973 nullptr, nullptr, nrnb, econqCoord);
1976 /* constrain the initial velocity, and save it */
1977 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
1978 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1979 ir, cr, step, 0, 1.0, md,
1980 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
1981 fr->bMolPBC, state->box,
1982 state->lambda[efptBONDED], &dvdl_dum,
1983 nullptr, nullptr, nrnb, econqVeloc);
1985 /* constrain the initial velocities at t-dt/2 */
1986 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1988 for (i = start; (i < end); i++)
1990 for (m = 0; (m < DIM); m++)
1992 /* Reverse the velocity */
1993 state->v[i][m] = -state->v[i][m];
1994 /* Store the position at t-dt in buf */
1995 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
1998 /* Shake the positions at t=-dt with the positions at t=0
1999 * as reference coordinates.
2003 char buf[STEPSTRSIZE];
2004 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
2005 gmx_step_str(step, buf));
2008 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
2009 ir, cr, step, -1, 1.0, md,
2010 as_rvec_array(state->x.data()), savex, nullptr,
2011 fr->bMolPBC, state->box,
2012 state->lambda[efptBONDED], &dvdl_dum,
2013 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
2015 for (i = start; i < end; i++)
2017 for (m = 0; m < DIM; m++)
2019 /* Re-reverse the velocities */
2020 state->v[i][m] = -state->v[i][m];
/*! \brief Numerically integrate a cubic-spline interaction table.
 *
 * Accumulates the energy (*enerout) and virial (*virout) integrals of
 * the tabulated potential between table indices rstart and rend,
 * using the analytic integral of each cubic-spline segment.
 * offstart selects dispersion (0) or repulsion (4) entries within the
 * 8-real-per-point table layout.
 */
2029 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
2030 double *enerout, double *virout)
2032 double enersum, virsum;
2033 double invscale, invscale2, invscale3;
2034 double r, ea, eb, ec, pa, pb, pc, pd;
2039 invscale = 1.0/scale;
2040 invscale2 = invscale*invscale;
2041 invscale3 = invscale*invscale2;
2043 /* Following summation derived from cubic spline definition,
2044 * Numerical Recipes in C, second edition, p. 113-116. Exact for
2045 * the cubic spline. We first calculate the negative of the
2046 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2047 * add the more standard, abrupt cutoff correction to that result,
2048 * yielding the long-range correction for a switched function. We
2049 * perform both the pressure and energy loops at the same time for
2050 * simplicity, as the computational cost is low. */
2054 /* Since the dispersion table has been scaled down a factor
2055 * 6.0 and the repulsion a factor 12.0 to compensate for the
2056 * c6/c12 parameters inside nbfp[] being scaled up (to save
2057 * flops in kernels), we need to correct for this.
2068 for (ri = rstart; ri < rend; ++ri)
2072 eb = 2.0*invscale2*r;
2076 pb = 3.0*invscale2*r;
2077 pc = 3.0*invscale*r*r;
2080 /* this "8" is from the packing in the vdwtab array - perhaps
2081 should be defined? */
2083 offset = 8*ri + offstart;
/* y0/f/g/h are the spline value and derivative coefficients at ri */
2084 y0 = vdwtab[offset];
2085 f = vdwtab[offset+1];
2086 g = vdwtab[offset+2];
2087 h = vdwtab[offset+3];
2089 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2090 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
/* 4*pi from the spherical volume element; tabfactor undoes the 6/12 scaling */
2092 *enerout = 4.0*M_PI*enersum*tabfactor;
2093 *virout = 4.0*M_PI*virsum*tabfactor;
/*! \brief Precompute long-range dispersion-correction constants.
 *
 * Fills fr->enershiftsix/enershifttwelve, fr->enerdiffsix/
 * enerdifftwelve and fr->virdiffsix/virdifftwelve with the energy and
 * virial differences between the simulated (cut-off/shifted/switched)
 * Lennard-Jones potential and the full 1/r^6 and 1/r^12 potentials,
 * used later by calc_dispcorr(). Switched/shifted forms use the
 * tabulated potential via integrate_table(); plain/PME/user cut-offs
 * use the analytical tail integrals.
 */
2096 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2098 double eners[2], virs[2], enersum, virsum;
2099 double r0, rc3, rc9;
2101 real scale, *vdwtab;
2103 fr->enershiftsix = 0;
2104 fr->enershifttwelve = 0;
2105 fr->enerdiffsix = 0;
2106 fr->enerdifftwelve = 0;
2108 fr->virdifftwelve = 0;
2110 if (eDispCorr != edispcNO)
2112 for (i = 0; i < 2; i++)
2117 if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
2118 (fr->vdw_modifier == eintmodPOTSWITCH) ||
2119 (fr->vdw_modifier == eintmodFORCESWITCH) ||
2120 (fr->vdwtype == evdwSHIFT) ||
2121 (fr->vdwtype == evdwSWITCH))
2123 if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
2124 (fr->vdw_modifier == eintmodFORCESWITCH) ||
2125 (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
2128 "With dispersion correction rvdw-switch can not be zero "
2129 "for vdw-type = %s", evdw_names[fr->vdwtype]);
2132 /* TODO This code depends on the logic in tables.c that
2133 constructs the table layout, which should be made
2134 explicit in future cleanup. */
2135 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2136 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2137 scale = fr->dispersionCorrectionTable->scale;
2138 vdwtab = fr->dispersionCorrectionTable->data;
2140 /* Round the cut-offs to exact table values for precision */
2141 ri0 = static_cast<int>(floor(fr->rvdw_switch*scale));
2142 ri1 = static_cast<int>(ceil(fr->rvdw*scale));
2144 /* The code below has some support for handling force-switching, i.e.
2145 * when the force (instead of potential) is switched over a limited
2146 * region. This leads to a constant shift in the potential inside the
2147 * switching region, which we can handle by adding a constant energy
2148 * term in the force-switch case just like when we do potential-shift.
2150 * For now this is not enabled, but to keep the functionality in the
2151 * code we check separately for switch and shift. When we do force-switch
2152 * the shifting point is rvdw_switch, while it is the cutoff when we
2153 * have a classical potential-shift.
2155 * For a pure potential-shift the potential has a constant shift
2156 * all the way out to the cutoff, and that is it. For other forms
2157 * we need to calculate the constant shift up to the point where we
2158 * start modifying the potential.
2160 ri0 = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2166 if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
2167 (fr->vdwtype == evdwSHIFT))
2169 /* Determine the constant energy shift below rvdw_switch.
2170 * Table has a scale factor since we have scaled it down to compensate
2171 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2173 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2174 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2176 else if (fr->vdw_modifier == eintmodPOTSHIFT)
2178 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2179 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2182 /* Add the constant part from 0 to rvdw_switch.
2183 * This integration from 0 to rvdw_switch overcounts the number
2184 * of interactions by 1, as it also counts the self interaction.
2185 * We will correct for this later.
2187 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2188 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2190 /* Calculate the contribution in the range [r0,r1] where we
2191 * modify the potential. For a pure potential-shift modifier we will
2192 * have ri0==ri1, and there will not be any contribution here.
2194 for (i = 0; i < 2; i++)
/* i==0: dispersion entries (offset 0); i==1: repulsion entries (offset 4) */
2198 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2199 eners[i] -= enersum;
2203 /* Alright: Above we compensated by REMOVING the parts outside r0
2204 * corresponding to the ideal VdW 1/r6 and /r12 potentials.
2206 * Regardless of whether r0 is the point where we start switching,
2207 * or the cutoff where we calculated the constant shift, we include
2208 * all the parts we are missing out to infinity from r0 by
2209 * calculating the analytical dispersion correction.
2211 eners[0] += -4.0*M_PI/(3.0*rc3);
2212 eners[1] += 4.0*M_PI/(9.0*rc9);
2213 virs[0] += 8.0*M_PI/rc3;
2214 virs[1] += -16.0*M_PI/(3.0*rc9);
2216 else if (fr->vdwtype == evdwCUT ||
2217 EVDW_PME(fr->vdwtype) ||
2218 fr->vdwtype == evdwUSER)
2220 if (fr->vdwtype == evdwUSER && fplog)
2223 "WARNING: using dispersion correction with user tables\n");
2226 /* Note that with LJ-PME, the dispersion correction is multiplied
2227 * by the difference between the actual C6 and the value of C6
2228 * that would produce the combination rule.
2229 * This means the normal energy and virial difference formulas
2233 rc3 = fr->rvdw*fr->rvdw*fr->rvdw;
2235 /* Contribution beyond the cut-off */
2236 eners[0] += -4.0*M_PI/(3.0*rc3);
2237 eners[1] += 4.0*M_PI/(9.0*rc9);
2238 if (fr->vdw_modifier == eintmodPOTSHIFT)
2240 /* Contribution within the cut-off */
2241 eners[0] += -4.0*M_PI/(3.0*rc3);
2242 eners[1] += 4.0*M_PI/(3.0*rc9);
2244 /* Contribution beyond the cut-off */
2245 virs[0] += 8.0*M_PI/rc3;
2246 virs[1] += -16.0*M_PI/(3.0*rc9);
2251 "Dispersion correction is not implemented for vdw-type = %s",
2252 evdw_names[fr->vdwtype]);
2255 /* When we deprecate the group kernels the code below can go too */
2256 if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2258 /* Calculate self-interaction coefficient (assuming that
2259 * the reciprocal-space contribution is constant in the
2260 * region that contributes to the self-interaction).
2262 fr->enershiftsix = gmx::power6(fr->ewaldcoeff_lj) / 6.0;
2264 eners[0] += -gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj)/3.0;
2265 virs[0] += gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj);
2268 fr->enerdiffsix = eners[0];
2269 fr->enerdifftwelve = eners[1];
2270 /* The 0.5 is due to the Gromacs definition of the virial */
2271 fr->virdiffsix = 0.5*virs[0];
2272 fr->virdifftwelve = 0.5*virs[1];
/*! \brief Apply the long-range dispersion correction.
 *
 * Uses the precomputed differences from calc_enervirdiff() to add the
 * dispersion-correction contributions to the energy (*enercorr), the
 * pressure/virial tensors (when an EnerPres variant is selected) and,
 * with free-energy perturbation, to dv/dlambda (*dvdlcorr). lambda is
 * used to interpolate the average C6/C12 between the A and B states.
 */
2276 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2277 matrix box, real lambda, tensor pres, tensor virial,
2278 real *prescorr, real *enercorr, real *dvdlcorr)
2280 gmx_bool bCorrAll, bCorrPres;
2281 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2291 if (ir->eDispCorr != edispcNO)
2293 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2294 ir->eDispCorr == edispcAllEnerPres);
2295 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2296 ir->eDispCorr == edispcAllEnerPres);
2298 invvol = 1/det(box);
2301 /* Only correct for the interactions with the inserted molecule */
2302 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
2307 dens = fr->numAtomsForDispersionCorrection*invvol;
2308 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2311 if (ir->efep == efepNO)
2313 avcsix = fr->avcsix[0];
2314 avctwelve = fr->avctwelve[0];
/* FEP: interpolate the average C6/C12 between states A and B */
2318 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2319 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2322 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2323 *enercorr += avcsix*enerdiff;
2325 if (ir->efep != efepNO)
2327 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
2331 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2332 *enercorr += avctwelve*enerdiff;
2333 if (fr->efep != efepNO)
2335 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2341 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2342 if (ir->eDispCorr == edispcAllEnerPres)
2344 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2346 /* The factor 2 is because of the Gromacs virial definition */
2347 spres = -2.0*invvol*svir*PRESFAC;
2349 for (m = 0; m < DIM; m++)
2351 virial[m][m] += svir;
2352 pres[m][m] += spres;
2357 /* Can't currently control when it prints, for now, just print when debugging */
2362 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2368 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2369 *enercorr, spres, svir);
2373 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2377 if (fr->efep != efepNO)
2379 *dvdlcorr += dvdlambda;
/*! \brief Make all molecules in the system whole under PBC.
 *
 * Walks over all molecule blocks in the topology; molecules that
 * consist of a single atom (or, after the first call, a single charge
 * group) need no treatment. For the rest a shift graph is built per
 * molecule type and each molecule copy is shifted to be whole.
 * bFirst selects the stricter first-time behavior.
 */
2384 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2385 const gmx_mtop_t *mtop, rvec x[],
2390 gmx_molblock_t *molb;
2392 if (bFirst && fplog)
2394 fprintf(fplog, "Removing pbc first time\n");
2399 for (mb = 0; mb < mtop->nmolblock; mb++)
2401 molb = &mtop->molblock[mb];
2402 if (molb->natoms_mol == 1 ||
2403 (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2405 /* Just one atom or charge group in the molecule, no PBC required */
2406 as += molb->nmol*molb->natoms_mol;
2410 /* Pass NULL iso fplog to avoid graph prints for each molecule type */
2411 mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2412 0, molb->natoms_mol, FALSE, FALSE, graph);
2414 for (mol = 0; mol < molb->nmol; mol++)
2416 mk_mshift(fplog, graph, ePBC, box, x+as);
2418 shift_self(graph, box, x+as);
2419 /* The molecule is whole now.
2420 * We don't need the second mk_mshift call as in do_pbc_first,
2421 * since we no longer need this graph.
2424 as += molb->natoms_mol;
/*! \brief Make molecules whole for the first time (bFirst = TRUE wrapper
 * around low_do_pbc_mtop()). */
2432 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2433 const gmx_mtop_t *mtop, rvec x[])
2435 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
/*! \brief Make molecules whole on subsequent calls (bFirst = FALSE
 * wrapper around low_do_pbc_mtop()). */
2438 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2439 gmx_mtop_t *mtop, rvec x[])
2441 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
/*! \brief OpenMP-parallel version of put_atoms_in_box().
 *
 * Splits the natoms coordinates into one contiguous slice per thread
 * and wraps each slice into the box. Exceptions inside the parallel
 * region are fatal (GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR).
 */
2444 void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
2447 nth = gmx_omp_nthreads_get(emntDefault);
2449 #pragma omp parallel for num_threads(nth) schedule(static)
2450 for (t = 0; t < nth; t++)
/* Integer split: thread t handles [natoms*t/nth, natoms*(t+1)/nth) */
2456 offset = (natoms*t )/nth;
2457 len = (natoms*(t + 1))/nth - offset;
2458 put_atoms_in_box(ePBC, box, len, x + offset);
2460 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2464 // TODO This can be cleaned up a lot, and move back to runner.cpp
/* Collect statistics over all ranks and print the end-of-run report:
 * summed flop counts, domain-decomposition statistics, the wallcycle
 * breakdown and the final performance numbers.
 *
 * fplog                log file that receives the report
 * mdlog                logger used for user-facing warnings
 * cr                   communication record (rank counts, duties, MPI comms)
 * inputrec             input parameters (integrator type, time step)
 * nrnb                 this rank's flop counters
 * wcycle               this rank's wallcycle counters
 * walltime_accounting  elapsed-time bookkeeping for this rank
 * nbv                  nonbonded data, used to fetch GPU timings
 * bWriteStat           whether performance stats also go to stderr
 */
2465 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2466 const t_inputrec *inputrec,
2467 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2468 gmx_walltime_accounting_t walltime_accounting,
2469 nonbonded_verlet_t *nbv,
2470 gmx_bool bWriteStat)
/* Flop counters summed over all ranks.
 * NOTE(review): assigned in code elided from this listing before the
 * reduction below dereferences it -- confirm in the full source. */
2472 t_nrnb *nrnb_tot = nullptr;
2474 double nbfs = 0, mflop = 0;
2475 double elapsed_time,
2476 elapsed_time_over_all_ranks,
2477 elapsed_time_over_all_threads,
2478 elapsed_time_over_all_threads_over_all_ranks;
2479 /* Control whether it is valid to print a report. Only the
2480 simulation master may print, but it should not do so if the run
2481 terminated e.g. before a scheduled reset step. This is
2482 complicated by the fact that PME ranks are unaware of the
2483 reason why they were sent a pmerecvqxFINISH. To avoid
2484 communication deadlocks, we always do the communication for the
2485 report, even if we've decided not to write the report, because
2486 how long it takes to finish the run is not important when we've
2487 decided not to report on the simulation performance. */
2488 bool printReport = SIMMASTER(cr);
2490 if (!walltime_accounting_get_valid_finish(walltime_accounting))
2492 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2493 printReport = false;
/* Sum the flop counters of all ranks of this simulation.
 * NOTE(review): in the full source this reduction only runs for
 * multi-rank MPI builds -- the guard is elided here, confirm. */
2500 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2501 cr->mpi_comm_mysim);
/* Gather elapsed-time statistics; the per-rank values double as the
 * all-rank values until the reductions below overwrite them. */
2509 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2510 elapsed_time_over_all_ranks = elapsed_time;
2511 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2512 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2516 /* reduce elapsed_time over all MPI ranks in the current simulation */
2517 MPI_Allreduce(&elapsed_time,
2518 &elapsed_time_over_all_ranks,
2519 1, MPI_DOUBLE, MPI_SUM,
2520 cr->mpi_comm_mysim);
/* Average (the sum above divided by the rank count) */
2521 elapsed_time_over_all_ranks /= cr->nnodes;
2522 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2523 * current simulation. */
2524 MPI_Allreduce(&elapsed_time_over_all_threads,
2525 &elapsed_time_over_all_threads_over_all_ranks,
2526 1, MPI_DOUBLE, MPI_SUM,
2527 cr->mpi_comm_mysim);
/* Report the flop totals; this also produces nbfs and mflop, which
 * feed the performance report below. */
2533 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
/* Domain-decomposition statistics only exist on PP ranks with DD. */
2540 if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2542 print_dd_statistics(cr, inputrec, fplog);
2545 /* TODO Move the responsibility for any scaling by thread counts
2546 * to the code that handled the thread region, so that there's a
2547 * mechanism to keep cycle counting working during the transition
2548 * to task parallelism. */
2549 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2550 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2551 wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
2552 auto cycle_sum(wallcycle_sum(cr, wcycle));
/* GPU timings are only available when the nonbondeds ran on a GPU. */
2556 struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2558 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2559 elapsed_time_over_all_ranks,
2560 wcycle, cycle_sum, gputimes);
/* The time step is only meaningful for dynamical integrators; it is
 * passed to print_perf below (declaration of delta_t is elided). */
2562 if (EI_DYNAMICS(inputrec->eI))
2564 delta_t = inputrec->delta_t;
2569 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2570 elapsed_time_over_all_ranks,
2571 walltime_accounting_get_nsteps_done(walltime_accounting),
2572 delta_t, nbfs, mflop);
/* NOTE(review): the stderr report is presumably guarded by bWriteStat
 * (otherwise that parameter is unused here) -- confirm in full source. */
2576 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2577 elapsed_time_over_all_ranks,
2578 walltime_accounting_get_nsteps_done(walltime_accounting),
2579 delta_t, nbfs, mflop);
/* Initialize the free-energy lambda state.
 *
 * Sets *fep_state from the inputrec, fills the lambda vector (either
 * the scalar init_lambda for every component, or per-component values
 * from the all_lambda table), records the initial values in lam0,
 * rescales simulated-tempering reference temperatures to the current
 * state, and logs the initial lambda components.
 *
 * Does nothing useful when neither free energy nor simulated tempering
 * is active (the early-out body at the efepNO test is elided here).
 */
2584 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2586 /* this function works, but could probably use a logic rewrite to keep all the different
2587 types of efep straight. */
2589 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2594 t_lambda *fep = ir->fepvals;
2595 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2596 if checkpoint is set -- a kludge is in for now
/* Fill each of the efptNR lambda components. */
2599 for (int i = 0; i < efptNR; i++)
2601 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2602 if (fep->init_lambda >= 0) /* if it's -1, it was never initializd */
2604 lambda[i] = fep->init_lambda;
/* NOTE(review): presumably guarded by (lam0 != nullptr) -- confirm. */
2607 lam0[i] = lambda[i];
/* No scalar init_lambda: take this component from the lambda table
 * at the initial state. */
2612 lambda[i] = fep->all_lambda[i][*fep_state];
2615 lam0[i] = lambda[i];
/* Simulated tempering: the reference temperature of each T-coupling
 * group follows the current tempering state. */
2621 /* need to rescale control temperatures to match current state */
2622 for (int i = 0; i < ir->opts.ngtc; i++)
2624 if (ir->opts.ref_t[i] > 0)
2626 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2631 /* Send to the log the information on the current lambdas */
2632 if (fplog != nullptr)
2634 fprintf(fplog, "Initial vector of lambda components:[ ");
2635 for (const auto &l : lambda)
2637 fprintf(fplog, "%10.4f ", l);
2639 fprintf(fplog, "]\n");
/* Set up run bookkeeping before the MD loop starts.
 *
 * Initializes the start time (*t and *t0 from ir->init_t), derives
 * *bSimAnn from the annealing settings, initializes the lambda state,
 * allocates the update and center-of-mass-removal structures, prints
 * thermostat/integrator citations to a fresh log, opens the output
 * files and the energy bookkeeping, and clears the virial matrices.
 * NOTE(review): one parameter line (file line 2651, used as *upd
 * below) is elided from this listing -- confirm in the full source.
 */
2645 void init_md(FILE *fplog,
2646 t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2647 t_inputrec *ir, const gmx_output_env_t *oenv,
2648 double *t, double *t0,
2649 gmx::ArrayRef<real> lambda, int *fep_state, double *lam0,
2650 t_nrnb *nrnb, gmx_mtop_t *mtop,
2652 int nfile, const t_filenm fnm[],
2653 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2654 tensor force_vir, tensor shake_vir, rvec mu_tot,
2655 gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
2656 gmx_wallcycle_t wcycle)
2660 /* Initial values */
2661 *t = *t0 = ir->init_t;
/* Scan all temperature-coupling groups for annealing. */
2664 for (i = 0; i < ir->opts.ngtc; i++)
2666 /* set bSimAnn if any group is being annealed */
2667 if (ir->opts.annealing[i] != eannNO)
2673 /* Initialize lambda variables */
2674 initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2676 // TODO upd is never NULL in practice, but the analysers don't know that
2679 *upd = init_update(ir);
/* The upd ? *upd : nullptr keeps the analysers happy (see TODO above). */
2683 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
/* Center-of-mass motion removal state. */
2688 *vcm = init_vcm(fplog, &mtop->groups, ir);
/* Cite the thermostat/integrator papers once per fresh (non-appended)
 * log file of a dynamical run. */
2691 if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2693 if (ir->etc == etcBERENDSEN)
2695 please_cite(fplog, "Berendsen84a");
2697 if (ir->etc == etcVRESCALE)
2699 please_cite(fplog, "Bussi2007a");
2701 if (ir->eI == eiSD1)
2703 please_cite(fplog, "Goga2012");
/* Open trajectory/energy output; when appending, the energy file
 * handle is not passed to init_mdebin (nullptr instead). */
2710 *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, outputProvider, ir, mtop, oenv, wcycle);
2712 *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? nullptr : mdoutf_get_fp_ene(*outf),
2713 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2716 /* Initiate variables */
2717 clear_mat(force_vir);
2718 clear_mat(shake_vir);