2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
52 #include "gromacs/domdec/dlbtiming.h"
53 #include "gromacs/domdec/domdec.h"
54 #include "gromacs/domdec/domdec_struct.h"
55 #include "gromacs/essentialdynamics/edsam.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/gmxlib/chargegroup.h"
58 #include "gromacs/gmxlib/network.h"
59 #include "gromacs/gmxlib/nrnb.h"
60 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
61 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
62 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
63 #include "gromacs/imd/imd.h"
64 #include "gromacs/listed-forces/bonded.h"
65 #include "gromacs/listed-forces/disre.h"
66 #include "gromacs/listed-forces/orires.h"
67 #include "gromacs/math/functions.h"
68 #include "gromacs/math/units.h"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/math/vecdump.h"
71 #include "gromacs/mdlib/calcmu.h"
72 #include "gromacs/mdlib/constr.h"
73 #include "gromacs/mdlib/force.h"
74 #include "gromacs/mdlib/forcerec.h"
75 #include "gromacs/mdlib/genborn.h"
76 #include "gromacs/mdlib/gmx_omp_nthreads.h"
77 #include "gromacs/mdlib/mdrun.h"
78 #include "gromacs/mdlib/nb_verlet.h"
79 #include "gromacs/mdlib/nbnxn_atomdata.h"
80 #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
81 #include "gromacs/mdlib/nbnxn_grid.h"
82 #include "gromacs/mdlib/nbnxn_search.h"
83 #include "gromacs/mdlib/qmmm.h"
84 #include "gromacs/mdlib/update.h"
85 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
86 #include "gromacs/mdtypes/commrec.h"
87 #include "gromacs/mdtypes/iforceprovider.h"
88 #include "gromacs/mdtypes/inputrec.h"
89 #include "gromacs/mdtypes/md_enums.h"
90 #include "gromacs/mdtypes/state.h"
91 #include "gromacs/pbcutil/ishift.h"
92 #include "gromacs/pbcutil/mshift.h"
93 #include "gromacs/pbcutil/pbc.h"
94 #include "gromacs/pulling/pull.h"
95 #include "gromacs/pulling/pull_rotation.h"
96 #include "gromacs/timing/cyclecounter.h"
97 #include "gromacs/timing/gpu_timing.h"
98 #include "gromacs/timing/wallcycle.h"
99 #include "gromacs/timing/wallcyclereporting.h"
100 #include "gromacs/timing/walltime_accounting.h"
101 #include "gromacs/utility/arrayref.h"
102 #include "gromacs/utility/basedefinitions.h"
103 #include "gromacs/utility/cstringutil.h"
104 #include "gromacs/utility/exceptions.h"
105 #include "gromacs/utility/fatalerror.h"
106 #include "gromacs/utility/gmxassert.h"
107 #include "gromacs/utility/gmxmpi.h"
108 #include "gromacs/utility/logger.h"
109 #include "gromacs/utility/pleasecite.h"
110 #include "gromacs/utility/smalloc.h"
111 #include "gromacs/utility/sysinfo.h"
113 #include "nbnxn_gpu.h"
114 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
// Prints the current step number and, once timing data is meaningful, an
// estimated finish time, remaining wall-clock seconds and performance in
// ns/day to 'out'. NOTE(review): this listing elides lines (original line
// numbers jump), so braces and some declarations are missing from view —
// confirm against the full source.
116 void print_time(FILE *out,
117 gmx_walltime_accounting_t walltime_accounting,
120 t_commrec gmx_unused *cr)
123 char timebuf[STRLEN];
124 double dt, elapsed_seconds, time_per_step;
// Echo the step number first; timing estimates follow only when available.
133 fprintf(out, "step %s", gmx_step_str(step, buf));
136 if ((step >= ir->nstlist))
138 double seconds_since_epoch = gmx_gettime();
139 elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
// Average wall time per step since init_step ...
140 time_per_step = elapsed_seconds/(step - ir->init_step + 1);
// ... and the projected wall time remaining until ir->nsteps is reached.
141 dt = (ir->nsteps + ir->init_step - step) * time_per_step;
147 finish = (time_t) (seconds_since_epoch + dt);
148 gmx_ctime_r(&finish, timebuf, STRLEN);
149 sprintf(buf, "%s", timebuf);
// Strip the trailing newline that the ctime-style formatting appends.
150 buf[strlen(buf)-1] = '\0';
151 fprintf(out, ", will finish %s", buf);
155 fprintf(out, ", remaining wall clock time: %5d s ", (int)dt);
// delta_t is in ps, so delta_t/1000 * 86400 / time_per_step gives ns/day.
160 fprintf(out, " performance: %.1f ns/day ",
161 ir->delta_t/1000*24*60*60/time_per_step);
// Writes "<title> on rank <nodeid> <time>" to the log file. NOTE(review):
// elided lines hide the 'the_time' parameter, the loop-variable declaration
// and (presumably) a NULL check on fplog — confirm against the full source.
174 void print_date_and_time(FILE *fplog, int nodeid, const char *title,
177 char time_string[STRLEN];
186 char timebuf[STRLEN];
187 time_t temp_time = (time_t) the_time;
189 gmx_ctime_r(&temp_time, timebuf, STRLEN);
// Copy only printable characters; this stops at the first control character,
// which drops the trailing newline of the ctime-style string.
190 for (i = 0; timebuf[i] >= ' '; i++)
192 time_string[i] = timebuf[i];
194 time_string[i] = '\0';
197 fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
// Logs "Started <name>" on this rank, stamped with the run's recorded
// start time from the wall-time accounting module.
200 void print_start(FILE *fplog, t_commrec *cr,
201 gmx_walltime_accounting_t walltime_accounting,
206 sprintf(buf, "Started %s", name);
207 print_date_and_time(fplog, cr->nodeid, buf,
208 walltime_accounting_get_start_time_stamp(walltime_accounting));
211 static void sum_forces(rvec f[], const PaddedRVecVector *forceToAdd)
213 /* TODO: remove this - 1 when padding is properly implemented */
214 int end = forceToAdd->size() - 1;
215 const rvec *fAdd = as_rvec_array(forceToAdd->data());
217 // cppcheck-suppress unreadVariable
218 int gmx_unused nt = gmx_omp_nthreads_get(emntDefault);
219 #pragma omp parallel for num_threads(nt) schedule(static)
220 for (int i = 0; i < end; i++)
222 rvec_inc(f[i], fAdd[i]);
// Accumulates the partial virial for the local atoms into vir_part:
// the shift-vector (surrounding-box) term, the short-range f*x term, the
// position-restraint diagonal and the wall z-contributions. Flop counts are
// registered in nrnb. NOTE(review): braces and the loop-variable declaration
// are elided in this listing.
226 static void calc_virial(int start, int homenr, rvec x[], rvec f[],
227 tensor vir_part, t_graph *graph, matrix box,
228 t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
232 /* The short-range virial from surrounding boxes */
234 calc_vir(SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
235 inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
237 /* Calculate partial virial, for local atoms only, based on short range.
238 * Total virial is computed in global_stat, called from do_md
240 f_calc_vir(start, start+homenr, x, f, vir_part, graph, box);
241 inc_nrnb(nrnb, eNR_VIRIAL, homenr);
243 /* Add position restraint contribution */
244 for (i = 0; i < DIM; i++)
246 vir_part[i][i] += fr->vir_diag_posres[i];
249 /* Add wall contribution */
250 for (i = 0; i < DIM; i++)
252 vir_part[i][ZZ] += fr->vir_wall_z[i];
// Diagnostic dump of the partial virial (debug builds/logging only).
257 pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
// Evaluates the COM pulling potential and forces, adding the pull energy to
// enerd->term[F_COM_PULL] and the restraint dV/dlambda to
// enerd->dvdl_lin[efptRESTRAINT]; timed under the ewcPULLPOT cycle counter.
// NOTE(review): the pbc and dvdl local declarations are elided in this
// listing.
261 static void pull_potential_wrapper(t_commrec *cr,
263 matrix box, rvec x[],
267 gmx_enerdata_t *enerd,
270 gmx_wallcycle_t wcycle)
275 /* Calculate the center of mass forces, this requires communication,
276 * which is why pull_potential is called close to other communication.
277 * The virial contribution is calculated directly,
278 * which is why we call pull_potential after calc_virial.
280 wallcycle_start(wcycle, ewcPULLPOT);
// Set up periodic boundary conditions for the pull distance calculations.
281 set_pbc(&pbc, ir->ePBC, box);
283 enerd->term[F_COM_PULL] +=
284 pull_potential(ir->pull_work, mdatoms, &pbc,
285 cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
286 enerd->dvdl_lin[efptRESTRAINT] += dvdl;
287 wallcycle_stop(wcycle, ewcPULLPOT);
// With separate PME ranks, receives the reciprocal-space forces, virials,
// energies and dV/dlambda terms from the PME ranks and accumulates them into
// fr and enerd. Also books the PP-during-PME and separate-PME cycle counts
// for domain-decomposition load balancing. NOTE(review): some lines (e.g.
// the tail of the gmx_pme_receive_f call) are elided in this listing.
290 static void pme_receive_force_ener(t_commrec *cr,
291 gmx_wallcycle_t wcycle,
292 gmx_enerdata_t *enerd,
295 real e_q, e_lj, dvdl_q, dvdl_lj;
296 float cycles_ppdpme, cycles_seppme;
298 cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
299 dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
301 /* In case of node-splitting, the PP nodes receive the long-range
302 * forces, virial and energy from the PME nodes here.
304 wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
307 gmx_pme_receive_f(cr, as_rvec_array(fr->f_novirsum->data()), fr->vir_el_recip, &e_q,
308 fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
// Accumulate reciprocal-space Coulomb/LJ energies and their linear dH/dl.
310 enerd->term[F_COUL_RECIP] += e_q;
311 enerd->term[F_LJ_RECIP] += e_lj;
312 enerd->dvdl_lin[efptCOUL] += dvdl_q;
313 enerd->dvdl_lin[efptVDW] += dvdl_lj;
317 dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
319 wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
// Prints every local atom whose force magnitude exceeds forceTolerance, or
// whose force is non-finite, then fatally aborts if any non-finite forces
// were seen. NOTE(review): the increment of numNonFinite is elided in this
// listing — confirm against the full source.
322 static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
323 gmx_int64_t step, real forceTolerance,
324 const rvec *x, const rvec *f)
// Compare squared norms so no sqrt is needed per atom.
326 real force2Tolerance = gmx::square(forceTolerance);
327 std::uintmax_t numNonFinite = 0;
328 for (int i = 0; i < md->homenr; i++)
330 real force2 = norm2(f[i]);
// NaN/Inf must be reported explicitly; NaN fails the >= comparison.
331 bool nonFinite = !std::isfinite(force2);
332 if (force2 >= force2Tolerance || nonFinite)
334 fprintf(fp, "step %" GMX_PRId64 " atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
336 ddglatnr(cr->dd, i), x[i][XX], x[i][YY], x[i][ZZ], std::sqrt(force2));
343 if (numNonFinite > 0)
345 /* Note that with MPI this fatal call on one rank might interrupt
346 * the printing on other ranks. But we can only avoid that with
347 * an expensive MPI barrier that we would need at each step.
349 gmx_fatal(FARGS, "At step %" GMX_PRId64 " detected non-finite forces on %ju atoms", step, numNonFinite);
// Post-processing after all force contributions are computed: spreads the
// mesh (PME) forces on virtual sites back onto their constructing atoms,
// adds the separately accumulated no-virial-summation forces plus their
// mesh virial contributions when the virial is requested, and optionally
// prints unusually large forces. NOTE(review): several surrounding
// conditionals/braces are elided in this listing.
353 static void post_process_forces(t_commrec *cr,
355 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
357 matrix box, rvec x[],
362 t_forcerec *fr, gmx_vsite_t *vsite,
369 /* Spread the mesh force on virtual sites to the other particles...
370 * This is parallellized. MPI communication is performed
371 * if the constructing atoms aren't local.
373 wallcycle_start(wcycle, ewcVSITESPREAD);
374 spread_vsite_f(vsite, x, as_rvec_array(fr->f_novirsum->data()), nullptr,
375 (flags & GMX_FORCE_VIRIAL), fr->vir_el_recip,
377 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
378 wallcycle_stop(wcycle, ewcVSITESPREAD);
380 if (flags & GMX_FORCE_VIRIAL)
382 /* Now add the forces, this is local */
383 sum_forces(f, fr->f_novirsum);
385 if (EEL_FULL(fr->eeltype))
387 /* Add the mesh contribution to the virial */
388 m_add(vir_force, fr->vir_el_recip, vir_force);
390 if (EVDW_PME(fr->vdwtype))
392 /* Add the mesh contribution to the virial */
393 m_add(vir_force, fr->vir_lj_recip, vir_force);
// Debug dump of the accumulated virial.
397 pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
// print_force < 0 disables large-force reporting.
402 if (fr->print_force >= 0)
404 print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
// Runs the Verlet-scheme non-bonded kernel for the given interaction
// locality (local/non-local): dispatches to the CPU SIMD kernels, the GPU
// launch path, or the plain-C GPU-reference kernel, then books flop counts
// (nrnb) for the executed kernel flavor, including switch-function and LJ
// Ewald extras. NOTE(review): braces, 'break' statements and some kernel
// arguments are elided in this listing.
408 static void do_nb_verlet(t_forcerec *fr,
409 interaction_const_t *ic,
410 gmx_enerdata_t *enerd,
411 int flags, int ilocality,
414 gmx_wallcycle_t wcycle)
416 int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
417 nonbonded_verlet_group_t *nbvg;
418 gmx_bool bUsingGpuKernels;
// Nothing to do if the caller did not request non-bonded interactions.
420 if (!(flags & GMX_FORCE_NONBONDED))
422 /* skip non-bonded calculation */
426 nbvg = &fr->nbv->grp[ilocality];
428 /* GPU kernel launch overhead is already timed separately */
// This routine only supports the Verlet cut-off scheme.
429 if (fr->cutoff_scheme != ecutsVERLET)
431 gmx_incons("Invalid cut-off scheme passed!");
434 bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
// CPU kernels are timed here; GPU launches have their own counters.
436 if (!bUsingGpuKernels)
438 wallcycle_sub_start(wcycle, ewcsNONBONDED);
440 switch (nbvg->kernel_type)
442 case nbnxnk4x4_PlainC:
443 case nbnxnk4xN_SIMD_4xN:
444 case nbnxnk4xN_SIMD_2xNN:
445 nbnxn_kernel_cpu(nbvg,
451 enerd->grpp.ener[egCOULSR],
453 enerd->grpp.ener[egBHAMSR] :
454 enerd->grpp.ener[egLJSR]);
457 case nbnxnk8x8x8_GPU:
458 nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
461 case nbnxnk8x8x8_PlainC:
462 nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
467 nbvg->nbat->out[0].f,
469 enerd->grpp.ener[egCOULSR],
471 enerd->grpp.ener[egBHAMSR] :
472 enerd->grpp.ener[egLJSR]);
476 gmx_incons("Invalid nonbonded kernel type passed!");
479 if (!bUsingGpuKernels)
481 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
// Pick the flop-count category matching the electrostatics treatment:
// reaction-field/cut-off, analytical Ewald, or tabulated Ewald.
484 if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
486 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
488 else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
489 (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
491 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
495 enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
497 enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
498 if (flags & GMX_FORCE_ENERGY)
500 /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
501 enr_nbnxn_kernel_ljc += 1;
502 enr_nbnxn_kernel_lj += 1;
// Book flops for LJ+Coulomb pairs, LJ-only pairs and Coulomb-only pairs.
505 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
506 nbvg->nbl_lists.natpair_ljq);
507 inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
508 nbvg->nbl_lists.natpair_lj);
509 /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
510 inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
511 nbvg->nbl_lists.natpair_q);
513 if (ic->vdw_modifier == eintmodFORCESWITCH)
515 /* We add up the switch cost separately */
516 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
517 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
519 if (ic->vdw_modifier == eintmodPOTSWITCH)
521 /* We add up the switch cost separately */
522 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
523 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
525 if (ic->vdwtype == evdwPME)
527 /* We add up the LJ Ewald cost separately */
528 inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
529 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
// Computes the perturbed (free-energy) non-bonded interactions over the
// Verlet FEP pair lists using the group-scheme FEP kernel, threaded with one
// list per OpenMP thread. Accumulates energies and dV/dlambda into enerd;
// with soft-core, additionally re-evaluates energies at every foreign lambda
// point. NOTE(review): braces and several local declarations (donb_flags,
// lam_i, loop indices) are elided in this listing.
533 static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
540 gmx_enerdata_t *enerd,
543 gmx_wallcycle_t wcycle)
546 nb_kernel_data_t kernel_data;
548 real dvdl_nb[efptNR];
553 /* Add short-range interactions */
554 donb_flags |= GMX_NONBONDED_DO_SR;
556 /* Currently all group scheme kernels always calculate (shift-)forces */
557 if (flags & GMX_FORCE_FORCES)
559 donb_flags |= GMX_NONBONDED_DO_FORCE;
561 if (flags & GMX_FORCE_VIRIAL)
563 donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
565 if (flags & GMX_FORCE_ENERGY)
567 donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
// Package kernel inputs/outputs for the free-energy kernel call.
570 kernel_data.flags = donb_flags;
571 kernel_data.lambda = lambda;
572 kernel_data.dvdl = dvdl_nb;
574 kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
575 kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR];
577 /* reset free energy components */
578 for (i = 0; i < efptNR; i++)
// One FEP pair list per non-bonded OpenMP thread is assumed.
583 assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);
585 wallcycle_sub_start(wcycle, ewcsNONBONDED);
586 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
587 for (th = 0; th < nbl_lists->nnbl; th++)
591 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
592 x, f, fr, mdatoms, &kernel_data, nrnb);
594 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
// With soft-core (sc_alpha != 0) dV/dlambda is non-linear in lambda,
// otherwise it contributes to the linear term.
597 if (fepvals->sc_alpha != 0)
599 enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
600 enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
604 enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
605 enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
608 /* If we do foreign lambda and we have soft-core interactions
609 * we have to recalculate the (non-linear) energies contributions.
611 if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
// Energy-only re-evaluation: forces are masked out of the kernel flags.
613 kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
614 kernel_data.lambda = lam_i;
615 kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
616 kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR];
617 /* Note that we add to kernel_data.dvdl, but ignore the result */
// Loop over all lambda points; index 0 corresponds to the current lambda.
619 for (i = 0; i < enerd->n_lambda; i++)
621 for (j = 0; j < efptNR; j++)
623 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
625 reset_foreign_enerdata(enerd);
626 #pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
627 for (th = 0; th < nbl_lists->nnbl; th++)
631 gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
632 x, f, fr, mdatoms, &kernel_data, nrnb);
634 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
// Sum the foreign energy groups and store the total for this lambda point.
637 sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
638 enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
642 wallcycle_sub_stop(wcycle, ewcsNONBONDED);
645 gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
647 return nbv != nullptr && nbv->bUseGPU;
// Zeroes n rvecs, using OpenMP threading only when the task is large enough
// to amortize the parallel-region overhead. NOTE(review): this listing
// elides the threshold conditional and the clear_rvec(v[i]) loop bodies —
// confirm against the full source.
650 static gmx_inline void clear_rvecs_omp(int n, rvec v[])
// Helper decides how many threads are worthwhile for an n-element rvec task.
652 int nth = gmx_omp_nthreads_get_simple_rvec_task(emntDefault, n);
654 /* Note that we would like to avoid this conditional by putting it
655 * into the omp pragma instead, but then we still take the full
656 * omp parallel for overhead (at least with gcc5).
// Serial path (small n / single thread).
660 for (int i = 0; i < n; i++)
// Parallel path for larger n.
667 #pragma omp parallel for num_threads(nth) schedule(static)
668 for (int i = 0; i < n; i++)
675 /*! \brief This routine checks if the potential energy is finite.
677 * Note that passing this check does not guarantee finite forces,
678 * since those use slightly different arithmetics. But in most cases
679 * there is just a narrow coordinate range where forces are not finite
680 * and energies are finite.
682 * \param[in] enerd The energy data; the non-bonded group energies need to be added in here before calling this routine
684 static void checkPotentialEnergyValidity(const gmx_enerdata_t *enerd)
// A NaN/Inf total potential energy is fatal: report it together with the
// short-range contributions to help diagnose the cause.
686 if (!std::isfinite(enerd->term[F_EPOT]))
// NOTE(review): some fatal-call arguments (the energy terms preceding
// F_COUL_SR) are elided in this listing.
688 gmx_fatal(FARGS, "The total potential energy is %g, which is not finite. The LJ and electrostatic contributions to the energy are %g and %g, respectively. A non-finite potential energy can be caused by overlapping interactions in bonded interactions or very large or NaN coordinate values. Usually this is caused by a badly or non-equilibrated initial configuration or incorrect interactions or parameters in the topology.",
691 enerd->term[F_COUL_SR]);
695 void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
696 t_inputrec *inputrec,
697 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
699 gmx_groups_t gmx_unused *groups,
700 matrix box, rvec x[], history_t *hist,
701 PaddedRVecVector *force,
704 gmx_enerdata_t *enerd, t_fcdata *fcd,
705 real *lambda, t_graph *graph,
706 t_forcerec *fr, interaction_const_t *ic,
707 gmx_vsite_t *vsite, rvec mu_tot,
708 double t, gmx_edsam_t ed,
711 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
712 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
716 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
717 gmx_bool bDoForces, bUseGPU, bUseOrEmulGPU;
718 gmx_bool bDiffKernels = FALSE;
719 rvec vzero, box_diag;
720 float cycles_pme, cycles_wait_gpu;
721 nonbonded_verlet_t *nbv = fr->nbv;
723 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
724 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
725 bFillGrid = (bNS && bStateChanged);
726 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
727 bDoForces = (flags & GMX_FORCE_FORCES);
728 bUseGPU = fr->nbv->bUseGPU;
729 bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu;
731 /* At a search step we need to start the first balancing region
732 * somewhere early inside the step after communication during domain
733 * decomposition (and not during the previous step as usual).
736 ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
738 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::yes);
744 const int homenr = mdatoms->homenr;
746 clear_mat(vir_force);
748 if (DOMAINDECOMP(cr))
750 cg1 = cr->dd->ncg_tot;
763 update_forcerec(fr, box);
765 if (inputrecNeedMutot(inputrec))
767 /* Calculate total (local) dipole moment in a temporary common array.
768 * This makes it possible to sum them over nodes faster.
770 calc_mu(start, homenr,
771 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
776 if (fr->ePBC != epbcNONE)
778 /* Compute shift vectors every step,
779 * because of pressure coupling or box deformation!
781 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
783 calc_shifts(box, fr->shift_vec);
788 put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
789 inc_nrnb(nrnb, eNR_SHIFTX, homenr);
791 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
793 unshift_self(graph, box, x);
797 nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
798 fr->shift_vec, nbv->grp[0].nbat);
801 if (!(cr->duty & DUTY_PME))
806 /* Send particle coordinates to the pme nodes.
807 * Since this is only implemented for domain decomposition
808 * and domain decomposition does not use the graph,
809 * we do not need to worry about shifting.
812 wallcycle_start(wcycle, ewcPP_PMESENDX);
814 bBS = (inputrec->nwall == 2);
818 svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
821 gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
822 lambda[efptCOUL], lambda[efptVDW],
823 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
826 wallcycle_stop(wcycle, ewcPP_PMESENDX);
830 /* do gridding for pair search */
833 if (graph && bStateChanged)
835 /* Calculate intramolecular shift vectors to make molecules whole */
836 mk_mshift(fplog, graph, fr->ePBC, box, x);
840 box_diag[XX] = box[XX][XX];
841 box_diag[YY] = box[YY][YY];
842 box_diag[ZZ] = box[ZZ][ZZ];
844 wallcycle_start(wcycle, ewcNS);
847 wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
848 nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
850 0, mdatoms->homenr, -1, fr->cginfo, x,
852 nbv->grp[eintLocal].kernel_type,
853 nbv->grp[eintLocal].nbat);
854 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
858 wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
859 nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
861 nbv->grp[eintNonlocal].kernel_type,
862 nbv->grp[eintNonlocal].nbat);
863 wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
866 if (nbv->ngrp == 1 ||
867 nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
869 nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
870 nbv->nbs, mdatoms, fr->cginfo);
874 nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
875 nbv->nbs, mdatoms, fr->cginfo);
876 nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
877 nbv->nbs, mdatoms, fr->cginfo);
879 wallcycle_stop(wcycle, ewcNS);
882 /* initialize the GPU atom data and copy shift vector */
887 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
888 nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
889 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
892 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
893 nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
894 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
897 /* do local pair search */
900 wallcycle_start_nocount(wcycle, ewcNS);
901 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
902 nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
905 nbv->min_ci_balanced,
906 &nbv->grp[eintLocal].nbl_lists,
908 nbv->grp[eintLocal].kernel_type,
910 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
914 /* initialize local pair-list on the GPU */
915 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
916 nbv->grp[eintLocal].nbl_lists.nbl[0],
919 wallcycle_stop(wcycle, ewcNS);
923 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
924 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
925 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
926 nbv->grp[eintLocal].nbat);
927 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
928 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
933 if (DOMAINDECOMP(cr))
935 ddOpenBalanceRegionGpu(cr->dd);
938 wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
939 /* launch local nonbonded F on GPU */
940 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
942 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
945 /* Communicate coordinates and sum dipole if necessary +
946 do non-local pair search */
947 if (DOMAINDECOMP(cr))
949 bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
950 nbv->grp[eintLocal].kernel_type);
954 /* With GPU+CPU non-bonded calculations we need to copy
955 * the local coordinates to the non-local nbat struct
956 * (in CPU format) as the non-local kernel call also
957 * calculates the local - non-local interactions.
959 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
960 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
961 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
962 nbv->grp[eintNonlocal].nbat);
963 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
964 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
969 wallcycle_start_nocount(wcycle, ewcNS);
970 wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
974 nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
977 nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
980 nbv->min_ci_balanced,
981 &nbv->grp[eintNonlocal].nbl_lists,
983 nbv->grp[eintNonlocal].kernel_type,
986 wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
988 if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
990 /* initialize non-local pair-list on the GPU */
991 nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
992 nbv->grp[eintNonlocal].nbl_lists.nbl[0],
995 wallcycle_stop(wcycle, ewcNS);
999 wallcycle_start(wcycle, ewcMOVEX);
1000 dd_move_x(cr->dd, box, x);
1001 wallcycle_stop(wcycle, ewcMOVEX);
1003 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1004 wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
1005 nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
1006 nbv->grp[eintNonlocal].nbat);
1007 wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
1008 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1011 if (bUseGPU && !bDiffKernels)
1013 wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
1014 /* launch non-local nonbonded F on GPU */
1015 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
1017 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1023 /* launch D2H copy-back F */
1024 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1025 if (DOMAINDECOMP(cr) && !bDiffKernels)
1027 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
1028 flags, eatNonlocal);
1030 nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
1032 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1035 if (bStateChanged && inputrecNeedMutot(inputrec))
1039 gmx_sumd(2*DIM, mu, cr);
1040 ddReopenBalanceRegionCpu(cr->dd);
1043 for (i = 0; i < 2; i++)
1045 for (j = 0; j < DIM; j++)
1047 fr->mu_tot[i][j] = mu[i*DIM + j];
1051 if (fr->efep == efepNO)
1053 copy_rvec(fr->mu_tot[0], mu_tot);
1057 for (j = 0; j < DIM; j++)
1060 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
1061 lambda[efptCOUL]*fr->mu_tot[1][j];
1065 /* Reset energies */
1066 reset_enerdata(enerd);
1067 clear_rvecs(SHIFTS, fr->fshift);
1069 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1071 wallcycle_start(wcycle, ewcPPDURINGPME);
1072 dd_force_flop_start(cr->dd, nrnb);
1077 wallcycle_start(wcycle, ewcROT);
1078 do_rotation(cr, inputrec, box, x, t, step, bNS);
1079 wallcycle_stop(wcycle, ewcROT);
1082 /* Temporary solution until all routines take PaddedRVecVector */
1083 rvec *f = as_rvec_array(force->data());
1085 /* Start the force cycle counter.
1086 * Note that a different counter is used for dynamic load balancing.
1088 wallcycle_start(wcycle, ewcFORCE);
1091 /* Reset forces for which the virial is calculated separately:
1092 * PME/Ewald forces if necessary */
1093 if (fr->bF_NoVirSum)
1095 if (flags & GMX_FORCE_VIRIAL)
1097 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1101 /* We are not calculating the pressure so we do not need
1102 * a separate array for forces that do not contribute
1105 fr->f_novirsum = force;
1109 if (fr->bF_NoVirSum)
1111 if (flags & GMX_FORCE_VIRIAL)
1113 /* TODO: remove this - 1 when padding is properly implemented */
1114 clear_rvecs_omp(fr->f_novirsum->size() - 1,
1115 as_rvec_array(fr->f_novirsum->data()));
1118 /* Clear the short- and long-range forces */
1119 clear_rvecs_omp(fr->natoms_force_constr, f);
1121 clear_rvec(fr->vir_diag_posres);
1124 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1126 clear_pull_forces(inputrec->pull_work);
1129 /* We calculate the non-bonded forces, when done on the CPU, here.
1130 * We do this before calling do_force_lowlevel, because in that
1131 * function, the listed forces are calculated before PME, which
1132 * does communication. With this order, non-bonded and listed
1133 * force calculation imbalance can be balanced out by the domain
1134 * decomposition load balancing.
1139 /* Maybe we should move this into do_force_lowlevel */
1140 do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
1144 if (fr->efep != efepNO)
1146 /* Calculate the local and non-local free energy interactions here.
1147 * Happens here on the CPU both with and without GPU.
1149 if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
1151 do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
1153 inputrec->fepvals, lambda,
1154 enerd, flags, nrnb, wcycle);
1157 if (DOMAINDECOMP(cr) &&
1158 fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
1160 do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
1162 inputrec->fepvals, lambda,
1163 enerd, flags, nrnb, wcycle);
1167 if (!bUseOrEmulGPU || bDiffKernels)
1171 if (DOMAINDECOMP(cr))
1173 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
1174 bDiffKernels ? enbvClearFYes : enbvClearFNo,
1184 aloc = eintNonlocal;
1187 /* Add all the non-bonded force to the normal force array.
1188 * This can be split into a local and a non-local part when overlapping
1189 * communication with calculation with domain decomposition.
1191 wallcycle_stop(wcycle, ewcFORCE);
1192 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1193 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1194 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
1195 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1196 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1197 wallcycle_start_nocount(wcycle, ewcFORCE);
1199 /* if there are multiple fshift output buffers reduce them */
1200 if ((flags & GMX_FORCE_VIRIAL) &&
1201 nbv->grp[aloc].nbl_lists.nnbl > 1)
1203 /* This is not in a subcounter because it takes a
1204 negligible and constant-sized amount of time */
1205 nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
1210 /* update QMMMrec, if necessary */
1213 update_QMMMrec(cr, fr, x, mdatoms, box, top);
1216 /* Compute the bonded and non-bonded energies and optionally forces */
1217 do_force_lowlevel(fr, inputrec, &(top->idef),
1218 cr, nrnb, wcycle, mdatoms,
1219 x, hist, f, enerd, fcd, top, fr->born,
1221 inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
1222 flags, &cycles_pme);
1224 wallcycle_stop(wcycle, ewcFORCE);
1228 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1231 if (bUseOrEmulGPU && !bDiffKernels)
1233 /* wait for non-local forces (or calculate in emulation mode) */
1234 if (DOMAINDECOMP(cr))
1238 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
1239 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1241 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1243 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
1247 wallcycle_start_nocount(wcycle, ewcFORCE);
1248 do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
1250 wallcycle_stop(wcycle, ewcFORCE);
1252 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1253 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1254 /* skip the reduction if there was no non-local work to do */
1255 if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
1257 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
1258 nbv->grp[eintNonlocal].nbat, f);
1260 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1261 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1265 if (DOMAINDECOMP(cr))
1267 /* We are done with the CPU compute.
1268 * We will now communicate the non-local forces.
1269 * If we use a GPU this will overlap with GPU work, so in that case
1270 * we do not close the DD force balancing region here.
1272 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1274 ddCloseBalanceRegionCpu(cr->dd);
1278 wallcycle_start(wcycle, ewcMOVEF);
1279 dd_move_f(cr->dd, f, fr->fshift);
1280 wallcycle_stop(wcycle, ewcMOVEF);
1286 /* wait for local forces (or calculate in emulation mode) */
1289 /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
1290 * is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
1291 * but even with a step of 0.1 ms the difference is less than 1%
1294 const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
1296 wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
1297 nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
1299 enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
1301 float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
1303 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1305 DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
1306 if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
1308 /* We measured few cycles, it could be that the kernel
1309 * and transfer finished earlier and there was no actual
1310 * wait time, only API call overhead.
1311 * Then the actual time could be anywhere between 0 and
1312 * cycles_wait_est. We will use half of cycles_wait_est.
1314 waitedForGpu = DdBalanceRegionWaitedForGpu::no;
1316 ddCloseBalanceRegionGpu(cr->dd, cycles_wait_gpu, waitedForGpu);
1319 /* now clear the GPU outputs while we finish the step on the CPU */
1320 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
1321 nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
1322 wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
1326 wallcycle_start_nocount(wcycle, ewcFORCE);
1327 do_nb_verlet(fr, ic, enerd, flags, eintLocal,
1328 DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
1330 wallcycle_stop(wcycle, ewcFORCE);
1332 wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
1333 wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
1334 nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
1335 nbv->grp[eintLocal].nbat, f);
1336 wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
1337 wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
1340 if (DOMAINDECOMP(cr))
1342 dd_force_flop_stop(cr->dd, nrnb);
1347 /* Collect forces from modules */
1348 gmx::ArrayRef<gmx::RVec> fNoVirSum;
1349 if (fr->bF_NoVirSum)
1351 fNoVirSum = *fr->f_novirsum;
1353 fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
1355 /* If we have NoVirSum forces, but we do not calculate the virial,
1356 * we sum fr->f_novirsum=f later.
1358 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1360 wallcycle_start(wcycle, ewcVSITESPREAD);
1361 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1362 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1363 wallcycle_stop(wcycle, ewcVSITESPREAD);
1366 if (flags & GMX_FORCE_VIRIAL)
1368 /* Calculation of the virial must be done after vsites! */
1369 calc_virial(0, mdatoms->homenr, x, f,
1370 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1374 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1376 /* Since the COM pulling is always done mass-weighted, no forces are
1377 * applied to vsites and this call can be done after vsite spreading.
1379 pull_potential_wrapper(cr, inputrec, box, x,
1380 f, vir_force, mdatoms, enerd, lambda, t,
1384 /* Add the forces from enforced rotation potentials (if any) */
1387 wallcycle_start(wcycle, ewcROTadd);
1388 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1389 wallcycle_stop(wcycle, ewcROTadd);
1392 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1393 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1395 if (PAR(cr) && !(cr->duty & DUTY_PME))
1397 /* In case of node-splitting, the PP nodes receive the long-range
1398 * forces, virial and energy from the PME nodes here.
1400 pme_receive_force_ener(cr, wcycle, enerd, fr);
1405 post_process_forces(cr, step, nrnb, wcycle,
1406 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1410 if (flags & GMX_FORCE_ENERGY)
1412 /* Sum the potential energy terms from group contributions */
1413 sum_epot(&(enerd->grpp), enerd->term);
1415 if (!EI_TPI(inputrec->eI))
1417 checkPotentialEnergyValidity(enerd);
/* Compute forces and energies with the (deprecated) group cut-off scheme.
 *
 * Visible pipeline: update forcerec/dipoles, put charge groups in the box,
 * send coordinates to PME ranks, communicate coordinates with DD, do
 * neighbour searching, evaluate low-level bonded+nonbonded terms,
 * communicate forces, spread vsite forces, compute the virial, then add
 * pulling/rotation/IMD contributions and receive PME forces.
 *
 * NOTE(review): this extract is missing interior lines (braces, parts of
 * argument lists, else-branches), so comments below describe only the
 * statements that are visible here.
 */
1422 void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
1423 t_inputrec *inputrec,
1424 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1425 gmx_localtop_t *top,
1426 gmx_groups_t *groups,
1427 matrix box, rvec x[], history_t *hist,
1428 PaddedRVecVector *force,
1431 gmx_enerdata_t *enerd, t_fcdata *fcd,
1432 real *lambda, t_graph *graph,
1433 t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
1434 double t, gmx_edsam_t ed,
1435 gmx_bool bBornRadii,
1437 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1438 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1442 gmx_bool bStateChanged, bNS, bFillGrid, bCalcCGCM;
1446 const int start = 0;
1447 const int homenr = mdatoms->homenr;
1449 clear_mat(vir_force);
1452 if (DOMAINDECOMP(cr))
1454 cg1 = cr->dd->ncg_tot;
/* Decode the force flags into local booleans controlling this step. */
1465 bStateChanged = (flags & GMX_FORCE_STATECHANGED);
1466 bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
1467 /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
1468 bFillGrid = (bNS && bStateChanged);
1469 bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
1470 bDoForces = (flags & GMX_FORCE_FORCES);
1474 update_forcerec(fr, box);
1476 if (inputrecNeedMutot(inputrec))
1478 /* Calculate total (local) dipole moment in a temporary common array.
1479 * This makes it possible to sum them over nodes faster.
1481 calc_mu(start, homenr,
1482 x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
1487 if (fr->ePBC != epbcNONE)
1489 /* Compute shift vectors every step,
1490 * because of pressure coupling or box deformation!
1492 if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
1494 calc_shifts(box, fr->shift_vec);
1499 put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
1500 &(top->cgs), x, fr->cg_cm)
1501 inc_nrnb(nrnb, eNR_CGCM, homenr);
1502 inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
1504 else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
1506 unshift_self(graph, box, x);
1511 calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
1512 inc_nrnb(nrnb, eNR_CGCM, homenr);
1515 if (bCalcCGCM && gmx_debug_at)
1517 pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
/* If this is a PP-only rank, ship coordinates to the separate PME ranks. */
1521 if (!(cr->duty & DUTY_PME))
1526 /* Send particle coordinates to the pme nodes.
1527 * Since this is only implemented for domain decomposition
1528 * and domain decomposition does not use the graph,
1529 * we do not need to worry about shifting.
1532 wallcycle_start(wcycle, ewcPP_PMESENDX);
1534 bBS = (inputrec->nwall == 2);
1537 copy_mat(box, boxs);
/* With two walls the box is scaled in z for the Ewald sum (wall_ewald_zfac). */
1538 svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
1541 gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
1542 lambda[efptCOUL], lambda[efptVDW],
1543 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
1546 wallcycle_stop(wcycle, ewcPP_PMESENDX);
1548 #endif /* GMX_MPI */
1550 /* Communicate coordinates and sum dipole if necessary */
1551 if (DOMAINDECOMP(cr))
1553 wallcycle_start(wcycle, ewcMOVEX);
1554 dd_move_x(cr->dd, box, x);
1555 wallcycle_stop(wcycle, ewcMOVEX);
1556 /* No GPU support, no move_x overlap, so reopen the balance region here */
1557 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1559 ddReopenBalanceRegionCpu(cr->dd);
1563 if (inputrecNeedMutot(inputrec))
/* Sum the A- and B-state dipole components (2*DIM doubles) over ranks. */
1569 gmx_sumd(2*DIM, mu, cr);
1570 ddReopenBalanceRegionCpu(cr->dd);
1572 for (i = 0; i < 2; i++)
1574 for (j = 0; j < DIM; j++)
1576 fr->mu_tot[i][j] = mu[i*DIM + j];
1580 if (fr->efep == efepNO)
1582 copy_rvec(fr->mu_tot[0], mu_tot);
/* Free-energy case: interpolate the dipole between A and B states. */
1586 for (j = 0; j < DIM; j++)
1589 (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
1594 /* Reset energies */
1595 reset_enerdata(enerd);
1596 clear_rvecs(SHIFTS, fr->fshift);
1600 wallcycle_start(wcycle, ewcNS);
1602 if (graph && bStateChanged)
1604 /* Calculate intramolecular shift vectors to make molecules whole */
1605 mk_mshift(fplog, graph, fr->ePBC, box, x);
1608 /* Do the actual neighbour searching */
1610 groups, top, mdatoms,
1611 cr, nrnb, bFillGrid);
1613 wallcycle_stop(wcycle, ewcNS);
1616 if (inputrec->implicit_solvent && bNS)
1618 make_gb_nblist(cr, inputrec->gb_algorithm,
1619 x, box, fr, &top->idef, graph, fr->born);
1622 if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
1624 wallcycle_start(wcycle, ewcPPDURINGPME);
1625 dd_force_flop_start(cr->dd, nrnb);
/* Enforced rotation (if enabled in the missing guard above, presumably). */
1630 wallcycle_start(wcycle, ewcROT);
1631 do_rotation(cr, inputrec, box, x, t, step, bNS);
1632 wallcycle_stop(wcycle, ewcROT);
1635 /* Temporary solution until all routines take PaddedRVecVector */
1636 rvec *f = as_rvec_array(force->data());
1638 /* Start the force cycle counter.
1639 * Note that a different counter is used for dynamic load balancing.
1641 wallcycle_start(wcycle, ewcFORCE);
1645 /* Reset forces for which the virial is calculated separately:
1646 * PME/Ewald forces if necessary */
1647 if (fr->bF_NoVirSum)
1649 if (flags & GMX_FORCE_VIRIAL)
1651 fr->f_novirsum = fr->forceBufferNoVirialSummation;
1652 /* TODO: remove this - 1 when padding is properly implemented */
1653 clear_rvecs(fr->f_novirsum->size() - 1,
1654 as_rvec_array(fr->f_novirsum->data()));
1658 /* We are not calculating the pressure so we do not need
1659 * a separate array for forces that do not contribute
1662 fr->f_novirsum = force;
1666 /* Clear the short- and long-range forces */
1667 clear_rvecs(fr->natoms_force_constr, f);
1669 clear_rvec(fr->vir_diag_posres);
1671 if (inputrec->bPull && pull_have_constraint(inputrec->pull_work))
1673 clear_pull_forces(inputrec->pull_work);
1676 /* update QMMMrec, if necessary */
1679 update_QMMMrec(cr, fr, x, mdatoms, box, top);
1682 /* Compute the bonded and non-bonded energies and optionally forces */
1683 do_force_lowlevel(fr, inputrec, &(top->idef),
1684 cr, nrnb, wcycle, mdatoms,
1685 x, hist, f, enerd, fcd, top, fr->born,
1687 inputrec->fepvals, lambda,
1688 graph, &(top->excls), fr->mu_tot,
1692 wallcycle_stop(wcycle, ewcFORCE);
1694 if (DOMAINDECOMP(cr))
1696 dd_force_flop_stop(cr->dd, nrnb);
1698 if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
1700 ddCloseBalanceRegionCpu(cr->dd);
/* Essential-dynamics flooding forces (guard presumably in a missing line). */
1706 do_flood(cr, inputrec, x, f, ed, box, step, bNS);
1711 /* Collect forces from modules */
1712 gmx::ArrayRef<gmx::RVec> fNoVirSum;
1713 if (fr->bF_NoVirSum)
1715 fNoVirSum = *fr->f_novirsum;
1717 fr->forceProviders->calculateForces(cr, mdatoms, box, t, x, *force, fNoVirSum);
1719 /* Communicate the forces */
1720 if (DOMAINDECOMP(cr))
1722 wallcycle_start(wcycle, ewcMOVEF);
1723 dd_move_f(cr->dd, f, fr->fshift);
1724 /* Do we need to communicate the separate force array
1725 * for terms that do not contribute to the single sum virial?
1726 * Position restraints and electric fields do not introduce
1727 * inter-cg forces, only full electrostatics methods do.
1728 * When we do not calculate the virial, fr->f_novirsum = f,
1729 * so we have already communicated these forces.
1731 if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
1732 (flags & GMX_FORCE_VIRIAL))
1734 dd_move_f(cr->dd, as_rvec_array(fr->f_novirsum->data()), nullptr);
1736 wallcycle_stop(wcycle, ewcMOVEF);
1739 /* If we have NoVirSum forces, but we do not calculate the virial,
1740 * we sum fr->f_novirsum=f later.
1742 if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
1744 wallcycle_start(wcycle, ewcVSITESPREAD);
1745 spread_vsite_f(vsite, x, f, fr->fshift, FALSE, nullptr, nrnb,
1746 &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
1747 wallcycle_stop(wcycle, ewcVSITESPREAD);
1750 if (flags & GMX_FORCE_VIRIAL)
1752 /* Calculation of the virial must be done after vsites! */
1753 calc_virial(0, mdatoms->homenr, x, f,
1754 vir_force, graph, box, nrnb, fr, inputrec->ePBC);
1758 if (inputrec->bPull && pull_have_potential(inputrec->pull_work))
1760 pull_potential_wrapper(cr, inputrec, box, x,
1761 f, vir_force, mdatoms, enerd, lambda, t,
1765 /* Add the forces from enforced rotation potentials (if any) */
1768 wallcycle_start(wcycle, ewcROTadd);
1769 enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
1770 wallcycle_stop(wcycle, ewcROTadd);
1773 /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
1774 IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
1776 if (PAR(cr) && !(cr->duty & DUTY_PME))
1778 /* In case of node-splitting, the PP nodes receive the long-range
1779 * forces, virial and energy from the PME nodes here.
1781 pme_receive_force_ener(cr, wcycle, enerd, fr);
1786 post_process_forces(cr, step, nrnb, wcycle,
1787 top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
1791 if (flags & GMX_FORCE_ENERGY)
1793 /* Sum the potential energy terms from group contributions */
1794 sum_epot(&(enerd->grpp), enerd->term);
1796 if (!EI_TPI(inputrec->eI))
/* TPI integrates over insertions with possibly huge energies, so the
 * validity check is skipped for it. */
1798 checkPotentialEnergyValidity(enerd);
/* Top-level force-computation entry point.
 *
 * Strips GMX_FORCE_NONBONDED when nonbonded interactions are disabled,
 * sanity-checks that the coordinate and force buffers carry the one extra
 * element needed for (unaligned) SIMD access, and then dispatches to the
 * Verlet- or group-scheme implementation based on inputrec->cutoff_scheme.
 *
 * NOTE(review): the switch-case argument lists are partially missing from
 * this extract; only the visible arguments are shown below.
 */
1804 void do_force(FILE *fplog, t_commrec *cr,
1805 t_inputrec *inputrec,
1806 gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
1807 gmx_localtop_t *top,
1808 gmx_groups_t *groups,
1809 matrix box, PaddedRVecVector *coordinates, history_t *hist,
1810 PaddedRVecVector *force,
1813 gmx_enerdata_t *enerd, t_fcdata *fcd,
1814 gmx::ArrayRef<real> lambda, t_graph *graph,
1816 gmx_vsite_t *vsite, rvec mu_tot,
1817 double t, gmx_edsam_t ed,
1818 gmx_bool bBornRadii,
1820 DdOpenBalanceRegionBeforeForceComputation ddOpenBalanceRegion,
1821 DdCloseBalanceRegionAfterForceComputation ddCloseBalanceRegion)
1823 /* modify force flag if not doing nonbonded */
1824 if (!fr->bNonbonded)
1826 flags &= ~GMX_FORCE_NONBONDED;
/* Buffers must hold natoms_force + 1 entries (SIMD padding), unless empty. */
1829 GMX_ASSERT(coordinates->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1830 fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1831 GMX_ASSERT(force->size() >= static_cast<unsigned int>(fr->natoms_force + 1) ||
1832 fr->natoms_force == 0, "We might need 1 element extra for SIMD");
1834 rvec *x = as_rvec_array(coordinates->data());
1836 switch (inputrec->cutoff_scheme)
1839 do_force_cutsVERLET(fplog, cr, inputrec,
1847 lambda.data(), graph,
1853 ddOpenBalanceRegion,
1854 ddCloseBalanceRegion);
1857 do_force_cutsGROUP(fplog, cr, inputrec,
1865 lambda.data(), graph,
1870 ddOpenBalanceRegion,
1871 ddCloseBalanceRegion);
1874 gmx_incons("Invalid cut-off scheme passed!");
1877 /* In case we don't have constraints and are using GPUs, the next balancing
1878 * region starts here.
1879 * Some "special" work at the end of do_force_cuts?, such as vsite spread,
1880 * virial calculation and COM pulling, is not thus not included in
1881 * the balance timing, which is ok as most tasks do communication.
1883 if (ddOpenBalanceRegion == DdOpenBalanceRegionBeforeForceComputation::yes)
1885 ddOpenBalanceRegionCpu(cr->dd, DdAllowBalanceRegionReopen::no);
/* Apply constraints to the starting configuration.
 *
 * Visible steps: constrain the initial coordinates, constrain (and keep)
 * the initial velocities, and — for leap-frog-style integrators
 * (EI_STATE_VELOCITY but not velocity Verlet) — construct positions at
 * t = -dt by reversing the velocities, constraining the back-propagated
 * positions against t = 0, and re-reversing the velocities afterwards.
 *
 * NOTE(review): interior lines (braces, some declarations such as savex/
 * dvdl_dum/step) are missing from this extract.
 */
1890 void do_constrain_first(FILE *fplog, gmx_constr_t constr,
1891 t_inputrec *ir, t_mdatoms *md,
1892 t_state *state, t_commrec *cr, t_nrnb *nrnb,
1893 t_forcerec *fr, gmx_localtop_t *top)
1895 int i, m, start, end;
1897 real dt = ir->delta_t;
1901 /* We need to allocate one element extra, since we might use
1902 * (unaligned) 4-wide SIMD loads to access rvec entries.
1904 snew(savex, state->natoms + 1);
1911 fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
1912 start, md->homenr, end);
1914 /* Do a first constrain to reset particles... */
1915 step = ir->init_step;
1918 char buf[STEPSTRSIZE];
1919 fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
1920 gmx_step_str(step, buf));
1924 /* constrain the current position */
1925 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1926 ir, cr, step, 0, 1.0, md,
1927 as_rvec_array(state->x.data()), as_rvec_array(state->x.data()), nullptr,
1928 fr->bMolPBC, state->box,
1929 state->lambda[efptBONDED], &dvdl_dum,
1930 nullptr, nullptr, nrnb, econqCoord);
1933 /* constrain the inital velocity, and save it */
1934 /* also may be useful if we need the ekin from the halfstep for velocity verlet */
1935 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1936 ir, cr, step, 0, 1.0, md,
1937 as_rvec_array(state->x.data()), as_rvec_array(state->v.data()), as_rvec_array(state->v.data()),
1938 fr->bMolPBC, state->box,
1939 state->lambda[efptBONDED], &dvdl_dum,
1940 nullptr, nullptr, nrnb, econqVeloc);
1942 /* constrain the inital velocities at t-dt/2 */
1943 if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
1945 for (i = start; (i < end); i++)
1947 for (m = 0; (m < DIM); m++)
1949 /* Reverse the velocity */
1950 state->v[i][m] = -state->v[i][m];
1951 /* Store the position at t-dt in buf */
1952 savex[i][m] = state->x[i][m] + dt*state->v[i][m];
1955 /* Shake the positions at t=-dt with the positions at t=0
1956 * as reference coordinates.
1960 char buf[STEPSTRSIZE];
1961 fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
1962 gmx_step_str(step, buf));
1965 constrain(nullptr, TRUE, FALSE, constr, &(top->idef),
1966 ir, cr, step, -1, 1.0, md,
1967 as_rvec_array(state->x.data()), savex, nullptr,
1968 fr->bMolPBC, state->box,
1969 state->lambda[efptBONDED], &dvdl_dum,
1970 as_rvec_array(state->v.data()), nullptr, nrnb, econqCoord);
1972 for (i = start; i < end; i++)
1974 for (m = 0; m < DIM; m++)
1976 /* Re-reverse the velocities */
1977 state->v[i][m] = -state->v[i][m];
/* Integrate a cubic-spline tabulated potential over [rstart, rend) table
 * points, producing the (negated) energy and virial integrals used by the
 * dispersion correction.
 *
 * The table is packed with 8 reals per point; `offstart` selects the
 * dispersion (0) or repulsion (4) sub-table, and y0/f/g/h are the spline
 * value and derivative coefficients at each point.
 *
 * NOTE(review): the declaration specifier(s) and several lines (ea/ec/pa/pd
 * assignments, r update, tabfactor setup) are missing from this extract, so
 * only the visible arithmetic is documented.
 */
1986 integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
1987 double *enerout, double *virout)
1989 double enersum, virsum;
1990 double invscale, invscale2, invscale3;
1991 double r, ea, eb, ec, pa, pb, pc, pd;
/* Powers of the inverse table spacing, reused in the spline integrals. */
1996 invscale = 1.0/scale;
1997 invscale2 = invscale*invscale;
1998 invscale3 = invscale*invscale2;
2000 /* Following summation derived from cubic spline definition,
2001 * Numerical Recipies in C, second edition, p. 113-116. Exact for
2002 * the cubic spline. We first calculate the negative of the
2003 * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
2004 * add the more standard, abrupt cutoff correction to that result,
2005 * yielding the long-range correction for a switched function. We
2006 * perform both the pressure and energy loops at the same time for
2007 * simplicity, as the computational cost is low. */
2011 /* Since the dispersion table has been scaled down a factor
2012 * 6.0 and the repulsion a factor 12.0 to compensate for the
2013 * c6/c12 parameters inside nbfp[] being scaled up (to save
2014 * flops in kernels), we need to correct for this.
2025 for (ri = rstart; ri < rend; ++ri)
2029 eb = 2.0*invscale2*r;
2033 pb = 3.0*invscale2*r;
2034 pc = 3.0*invscale*r*r;
2037 /* this "8" is from the packing in the vdwtab array - perhaps
2038 should be defined? */
2040 offset = 8*ri + offstart;
2041 y0 = vdwtab[offset];
2042 f = vdwtab[offset+1];
2043 g = vdwtab[offset+2];
2044 h = vdwtab[offset+3];
/* Accumulate the analytic integrals of the spline segment for the energy
 * and the virial (the latter weighted by the derivative coefficients). */
2046 enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
2047 virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
/* 4*pi from the spherical volume element; tabfactor undoes the 6/12
 * scaling mentioned above (set in a line missing from this extract). */
2049 *enerout = 4.0*M_PI*enersum*tabfactor;
2050 *virout = 4.0*M_PI*virsum*tabfactor;
/* Precompute the energy and virial differences used for the long-range
 * dispersion correction, for both the r^-6 (dispersion) and r^-12
 * (repulsion) terms, storing the results in fr->enerdiff*/enershift*/
 * virdiff* fields.
 *
 * Handles switched/shifted potentials by integrating the tabulated
 * modified region with integrate_table(), and plain/PME/user cut-offs
 * analytically; LJ-PME with the group scheme additionally gets a
 * self-interaction shift.
 *
 * NOTE(review): braces and some lines (e.g. eners/virs zeroing, rc3/rc9
 * setup in the switched branch) are missing from this extract.
 */
2053 void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
2055 double eners[2], virs[2], enersum, virsum;
2056 double r0, rc3, rc9;
2058 real scale, *vdwtab;
/* Reset all correction terms; they stay zero when eDispCorr == edispcNO. */
2060 fr->enershiftsix = 0;
2061 fr->enershifttwelve = 0;
2062 fr->enerdiffsix = 0;
2063 fr->enerdifftwelve = 0;
2065 fr->virdifftwelve = 0;
2067 if (eDispCorr != edispcNO)
2069 for (i = 0; i < 2; i++)
/* Branch 1: potential is modified (switched/shifted) near the cut-off. */
2074 if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
2075 (fr->vdw_modifier == eintmodPOTSWITCH) ||
2076 (fr->vdw_modifier == eintmodFORCESWITCH) ||
2077 (fr->vdwtype == evdwSHIFT) ||
2078 (fr->vdwtype == evdwSWITCH))
2080 if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
2081 (fr->vdw_modifier == eintmodFORCESWITCH) ||
2082 (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
2085 "With dispersion correction rvdw-switch can not be zero "
2086 "for vdw-type = %s", evdw_names[fr->vdwtype]);
2089 /* TODO This code depends on the logic in tables.c that
2090 constructs the table layout, which should be made
2091 explicit in future cleanup. */
2092 GMX_ASSERT(fr->dispersionCorrectionTable->interaction == GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
2093 "Dispersion-correction code needs a table with both repulsion and dispersion terms");
2094 scale = fr->dispersionCorrectionTable->scale;
2095 vdwtab = fr->dispersionCorrectionTable->data;
2097 /* Round the cut-offs to exact table values for precision */
2098 ri0 = static_cast<int>(floor(fr->rvdw_switch*scale));
2099 ri1 = static_cast<int>(ceil(fr->rvdw*scale));
2101 /* The code below has some support for handling force-switching, i.e.
2102 * when the force (instead of potential) is switched over a limited
2103 * region. This leads to a constant shift in the potential inside the
2104 * switching region, which we can handle by adding a constant energy
2105 * term in the force-switch case just like when we do potential-shift.
2107 * For now this is not enabled, but to keep the functionality in the
2108 * code we check separately for switch and shift. When we do force-switch
2109 * the shifting point is rvdw_switch, while it is the cutoff when we
2110 * have a classical potential-shift.
2112 * For a pure potential-shift the potential has a constant shift
2113 * all the way out to the cutoff, and that is it. For other forms
2114 * we need to calculate the constant shift up to the point where we
2115 * start modifying the potential.
2117 ri0 = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
2123 if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
2124 (fr->vdwtype == evdwSHIFT))
2126 /* Determine the constant energy shift below rvdw_switch.
2127 * Table has a scale factor since we have scaled it down to compensate
2128 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
2130 fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
2131 fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
2133 else if (fr->vdw_modifier == eintmodPOTSHIFT)
2135 fr->enershiftsix = (real)(-1.0/(rc3*rc3));
2136 fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
2139 /* Add the constant part from 0 to rvdw_switch.
2140 * This integration from 0 to rvdw_switch overcounts the number
2141 * of interactions by 1, as it also counts the self interaction.
2142 * We will correct for this later.
2144 eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
2145 eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
2147 /* Calculate the contribution in the range [r0,r1] where we
2148 * modify the potential. For a pure potential-shift modifier we will
2149 * have ri0==ri1, and there will not be any contribution here.
2151 for (i = 0; i < 2; i++)
/* i == 0: dispersion sub-table (offset 0); i == 1: repulsion (offset 4). */
2155 integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
2156 eners[i] -= enersum;
2160 /* Alright: Above we compensated by REMOVING the parts outside r0
2161 * corresponding to the ideal VdW 1/r6 and /r12 potentials.
2163 * Regardless of whether r0 is the point where we start switching,
2164 * or the cutoff where we calculated the constant shift, we include
2165 * all the parts we are missing out to infinity from r0 by
2166 * calculating the analytical dispersion correction.
2168 eners[0] += -4.0*M_PI/(3.0*rc3);
2169 eners[1] += 4.0*M_PI/(9.0*rc9);
2170 virs[0] += 8.0*M_PI/rc3;
2171 virs[1] += -16.0*M_PI/(3.0*rc9);
/* Branch 2: plain cut-off, LJ-PME, or user tables: analytic tail only. */
2173 else if (fr->vdwtype == evdwCUT ||
2174 EVDW_PME(fr->vdwtype) ||
2175 fr->vdwtype == evdwUSER)
2177 if (fr->vdwtype == evdwUSER && fplog)
2180 "WARNING: using dispersion correction with user tables\n");
2183 /* Note that with LJ-PME, the dispersion correction is multiplied
2184 * by the difference between the actual C6 and the value of C6
2185 * that would produce the combination rule.
2186 * This means the normal energy and virial difference formulas
2190 rc3 = fr->rvdw*fr->rvdw*fr->rvdw;
2192 /* Contribution beyond the cut-off */
2193 eners[0] += -4.0*M_PI/(3.0*rc3);
2194 eners[1] += 4.0*M_PI/(9.0*rc9);
2195 if (fr->vdw_modifier == eintmodPOTSHIFT)
2197 /* Contribution within the cut-off */
2198 eners[0] += -4.0*M_PI/(3.0*rc3);
2199 eners[1] += 4.0*M_PI/(3.0*rc9);
2201 /* Contribution beyond the cut-off */
2202 virs[0] += 8.0*M_PI/rc3;
2203 virs[1] += -16.0*M_PI/(3.0*rc9);
2208 "Dispersion correction is not implemented for vdw-type = %s",
2209 evdw_names[fr->vdwtype]);
2212 /* When we deprecate the group kernels the code below can go too */
2213 if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
2215 /* Calculate self-interaction coefficient (assuming that
2216 * the reciprocal-space contribution is constant in the
2217 * region that contributes to the self-interaction).
2219 fr->enershiftsix = gmx::power6(fr->ewaldcoeff_lj) / 6.0;
2221 eners[0] += -gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj)/3.0;
2222 virs[0] += gmx::power3(std::sqrt(M_PI)*fr->ewaldcoeff_lj);
2225 fr->enerdiffsix = eners[0];
2226 fr->enerdifftwelve = eners[1];
2227 /* The 0.5 is due to the Gromacs definition of the virial */
2228 fr->virdiffsix = 0.5*virs[0];
2229 fr->virdifftwelve = 0.5*virs[1];
/* Apply the long-range dispersion (and optionally pressure/virial)
 * correction for the current box volume, accumulating into *enercorr,
 * *prescorr, *dvdlcorr, and the virial/pressure tensors.
 *
 * Uses the precomputed fr->enerdiff*/virdiff*/enershift* terms (see
 * calc_enervirdiff) and, with free-energy perturbation, interpolates the
 * average C6/C12 between the A and B states with lambda.
 *
 * NOTE(review): braces and some lines are missing from this extract. Also
 * note the visible mix of ir->efep and fr->efep checks (e.g. the C12
 * dvdlambda guard uses fr->efep) — looks intentional but worth confirming
 * against the full file.
 */
2233 void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
2234 matrix box, real lambda, tensor pres, tensor virial,
2235 real *prescorr, real *enercorr, real *dvdlcorr)
2237 gmx_bool bCorrAll, bCorrPres;
2238 real dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
2248 if (ir->eDispCorr != edispcNO)
2250 bCorrAll = (ir->eDispCorr == edispcAllEner ||
2251 ir->eDispCorr == edispcAllEnerPres);
2252 bCorrPres = (ir->eDispCorr == edispcEnerPres ||
2253 ir->eDispCorr == edispcAllEnerPres);
2255 invvol = 1/det(box);
2258 /* Only correct for the interactions with the inserted molecule */
2259 dens = (fr->numAtomsForDispersionCorrection - fr->n_tpi)*invvol;
/* Non-TPI case: density from all atoms; ninter = N/2 pair interactions. */
2264 dens = fr->numAtomsForDispersionCorrection*invvol;
2265 ninter = 0.5*fr->numAtomsForDispersionCorrection;
2268 if (ir->efep == efepNO)
2270 avcsix = fr->avcsix[0];
2271 avctwelve = fr->avctwelve[0];
2275 avcsix = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
2276 avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
2279 enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
2280 *enercorr += avcsix*enerdiff;
2282 if (ir->efep != efepNO)
2284 dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
/* Repulsion (C12) part — presumably only when bCorrAll (guard is in a
 * line missing from this extract). */
2288 enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
2289 *enercorr += avctwelve*enerdiff;
2290 if (fr->efep != efepNO)
2292 dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
2298 svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
2299 if (ir->eDispCorr == edispcAllEnerPres)
2301 svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
2303 /* The factor 2 is because of the Gromacs virial definition */
2304 spres = -2.0*invvol*svir*PRESFAC;
/* Isotropic correction: add to the diagonal only. */
2306 for (m = 0; m < DIM; m++)
2308 virial[m][m] += svir;
2309 pres[m][m] += spres;
2314 /* Can't currently control when it prints, for now, just print when degugging */
2319 fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
2325 "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
2326 *enercorr, spres, svir);
2330 fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
2334 if (fr->efep != efepNO)
2336 *dvdlcorr += dvdlambda;
/* Make all molecules in the topology whole across periodic boundaries.
 *
 * For each molecule block: single-atom molecules (or, on repeat calls,
 * single-charge-group molecule types) need no PBC treatment and are
 * skipped; otherwise a connectivity graph is built once per molecule type
 * and each molecule instance is shifted to be whole.
 *
 * NOTE(review): braces, graph allocation/free and the trailing `bFirst`
 * parameter declaration are missing from this extract.
 */
2341 static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2342 const gmx_mtop_t *mtop, rvec x[],
2347 gmx_molblock_t *molb;
2349 if (bFirst && fplog)
2351 fprintf(fplog, "Removing pbc first time\n");
2356 for (mb = 0; mb < mtop->nmolblock; mb++)
2358 molb = &mtop->molblock[mb];
2359 if (molb->natoms_mol == 1 ||
2360 (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
2362 /* Just one atom or charge group in the molecule, no PBC required */
2363 as += molb->nmol*molb->natoms_mol;
2367 /* Pass NULL iso fplog to avoid graph prints for each molecule type */
2368 mk_graph_ilist(nullptr, mtop->moltype[molb->type].ilist,
2369 0, molb->natoms_mol, FALSE, FALSE, graph);
2371 for (mol = 0; mol < molb->nmol; mol++)
2373 mk_mshift(fplog, graph, ePBC, box, x+as);
2375 shift_self(graph, box, x+as);
2376 /* The molecule is whole now.
2377 * We don't need the second mk_mshift call as in do_pbc_first,
2378 * since we no longer need this graph.
/* `as` tracks the running atom offset into x for the next molecule. */
2381 as += molb->natoms_mol;
/* Convenience wrapper: make molecules whole for the first time
 * (bFirst = TRUE enables the log message and per-charge-group handling
 * in low_do_pbc_mtop). */
2389 void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
2390 const gmx_mtop_t *mtop, rvec x[])
2392 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
/* Convenience wrapper: make molecules whole on subsequent calls
 * (bFirst = FALSE, so single-charge-group molecules are skipped). */
2395 void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
2396 gmx_mtop_t *mtop, rvec x[])
2398 low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
/* Wrap all atom coordinates back into the unit cell, parallelized over
 * OpenMP threads: each thread handles a contiguous [offset, offset+len)
 * slice of x. Exceptions inside the parallel region are fatal (they must
 * not escape an OpenMP construct).
 */
2401 void put_atoms_in_box_omp(int ePBC, const matrix box, int natoms, rvec x[])
2404 nth = gmx_omp_nthreads_get(emntDefault);
2406 #pragma omp parallel for num_threads(nth) schedule(static)
2407 for (t = 0; t < nth; t++)
/* Integer partition of natoms across threads; the t+1 formula makes the
 * slices cover the range exactly with no gaps or overlap. */
2413 offset = (natoms*t )/nth;
2414 len = (natoms*(t + 1))/nth - offset;
2415 put_atoms_in_box(ePBC, box, len, x + offset);
2417 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
2421 // TODO This can be cleaned up a lot, and move back to runner.cpp
/* End-of-run bookkeeping: reduce flop (nrnb) and timing counters across
 * ranks, then print flop, domain-decomposition, cycle-accounting and
 * performance summaries.
 *
 * All ranks always participate in the reductions — even when the report
 * will not be printed — to avoid communication deadlocks (see comment
 * below). Only the simulation master prints, and only if the run finished
 * cleanly.
 *
 * NOTE(review): braces and some lines (nrnb_tot allocation, the non-MPI
 * branch, delta_t initialization) are missing from this extract.
 */
2422 void finish_run(FILE *fplog, const gmx::MDLogger &mdlog, t_commrec *cr,
2423 const t_inputrec *inputrec,
2424 t_nrnb nrnb[], gmx_wallcycle_t wcycle,
2425 gmx_walltime_accounting_t walltime_accounting,
2426 nonbonded_verlet_t *nbv,
2427 gmx_bool bWriteStat)
2429 t_nrnb *nrnb_tot = nullptr;
2431 double nbfs = 0, mflop = 0;
2432 double elapsed_time,
2433 elapsed_time_over_all_ranks,
2434 elapsed_time_over_all_threads,
2435 elapsed_time_over_all_threads_over_all_ranks;
2436 /* Control whether it is valid to print a report. Only the
2437 simulation master may print, but it should not do so if the run
2438 terminated e.g. before a scheduled reset step. This is
2439 complicated by the fact that PME ranks are unaware of the
2440 reason why they were sent a pmerecvqxFINISH. To avoid
2441 communication deadlocks, we always do the communication for the
2442 report, even if we've decided not to write the report, because
2443 how long it takes to finish the run is not important when we've
2444 decided not to report on the simulation performance. */
2445 bool printReport = SIMMASTER(cr);
2447 if (!walltime_accounting_get_valid_finish(walltime_accounting))
2449 GMX_LOG(mdlog.warning).asParagraph().appendText("Simulation ended prematurely, no performance report will be written.");
2450 printReport = false;
/* Sum per-rank flop counters into nrnb_tot (allocated in a missing line). */
2457 MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
2458 cr->mpi_comm_mysim);
2466 elapsed_time = walltime_accounting_get_elapsed_time(walltime_accounting);
2467 elapsed_time_over_all_ranks = elapsed_time;
2468 elapsed_time_over_all_threads = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
2469 elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
2473 /* reduce elapsed_time over all MPI ranks in the current simulation */
2474 MPI_Allreduce(&elapsed_time,
2475 &elapsed_time_over_all_ranks,
2476 1, MPI_DOUBLE, MPI_SUM,
2477 cr->mpi_comm_mysim);
2478 elapsed_time_over_all_ranks /= cr->nnodes;
2479 /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
2480 * current simulation. */
2481 MPI_Allreduce(&elapsed_time_over_all_threads,
2482 &elapsed_time_over_all_threads_over_all_ranks,
2483 1, MPI_DOUBLE, MPI_SUM,
2484 cr->mpi_comm_mysim);
2490 print_flop(fplog, nrnb_tot, &nbfs, &mflop);
2497 if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
2499 print_dd_statistics(cr, inputrec, fplog);
2502 /* TODO Move the responsibility for any scaling by thread counts
2503 * to the code that handled the thread region, so that there's a
2504 * mechanism to keep cycle counting working during the transition
2505 * to task parallelism. */
2506 int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
2507 int nthreads_pme = gmx_omp_nthreads_get(emntPME);
2508 wallcycle_scale_by_num_threads(wcycle, cr->duty == DUTY_PME, nthreads_pp, nthreads_pme);
2509 auto cycle_sum(wallcycle_sum(cr, wcycle));
/* GPU kernel timings are only available when a GPU was actually in use. */
2513 struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : nullptr;
2515 wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
2516 elapsed_time_over_all_ranks,
2517 wcycle, cycle_sum, gputimes);
2519 if (EI_DYNAMICS(inputrec->eI))
2521 delta_t = inputrec->delta_t;
2526 print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
2527 elapsed_time_over_all_ranks,
2528 walltime_accounting_get_nsteps_done(walltime_accounting),
2529 delta_t, nbfs, mflop);
/* Second print goes to stderr — presumably guarded by bWriteStat in a
 * missing line. */
2533 print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
2534 elapsed_time_over_all_ranks,
2535 walltime_accounting_get_nsteps_done(walltime_accounting),
2536 delta_t, nbfs, mflop);
/* Set the initial free-energy state and lambda vector from the inputrec.
 *
 * Writes the starting FEP state index to *fep_state, fills lambda with one
 * value per efpt component, records the starting values in lam0, and for
 * simulated tempering rescales the T-coupling reference temperatures to
 * the initial state.  The resulting lambda vector is echoed to fplog when
 * it is non-null.
 *
 * NOTE(review): this chunk is elided (original line numbers jump), so
 * branch bodies and the else-paths between the visible statements are not
 * shown here. */
2541 extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, gmx::ArrayRef<real> lambda, double *lam0)
2543 /* this function works, but could probably use a logic rewrite to keep all the different
2544 types of efep straight. */
/* Nothing to initialize when neither free-energy perturbation nor
 * simulated tempering is active (the early-return body of this branch is
 * elided from this chunk). */
2546 if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
2551 t_lambda *fep = ir->fepvals;
2552 *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
2553 if checkpoint is set -- a kludge is in for now
/* One lambda value per free-energy component type (efptNR of them). */
2556 for (int i = 0; i < efptNR; i++)
2558 /* overwrite lambda state with init_lambda for now for backwards compatibility */
2559 if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
/* Scalar init-lambda: every component gets the same value. */
2561 lambda[i] = fep->init_lambda;
/* Record starting value; presumably inside a lam0 != nullptr guard in the
 * elided lines -- TODO confirm against the full source. */
2564 lam0[i] = lambda[i];
/* Otherwise (elided else-branch) take the per-component value for the
 * chosen state from the lambda table. */
2569 lambda[i] = fep->all_lambda[i][*fep_state];
2572 lam0[i] = lambda[i];
2578 /* need to rescale control temperatures to match current state */
2579 for (int i = 0; i < ir->opts.ngtc; i++)
2581 if (ir->opts.ref_t[i] > 0)
/* Every T-coupling group with a positive reference temperature is set to
 * the simulated-tempering temperature of the initial state. */
2583 ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
2588 /* Send to the log the information on the current lambdas */
2589 if (fplog != nullptr)
2591 fprintf(fplog, "Initial vector of lambda components:[ ");
2592 for (const auto &l : lambda)
2594 fprintf(fplog, "%10.4f ", l);
2596 fprintf(fplog, "]\n");
/* One-stop setup for an MD-style run: initializes times, annealing flag,
 * lambdas, the update and center-of-mass-motion-removal machinery, output
 * files and the energy bin, and clears the virial/dipole accumulators.
 *
 * NOTE(review): the function continues past the end of this chunk, and the
 * visible body is elided (original line numbers jump), so braces and some
 * statements between the visible lines are missing here. */
2602 void init_md(FILE *fplog,
2603 t_commrec *cr, gmx::IMDOutputProvider *outputProvider,
2604 t_inputrec *ir, const gmx_output_env_t *oenv,
2605 double *t, double *t0,
2606 gmx::ArrayRef<real> lambda, int *fep_state, double *lam0,
2607 t_nrnb *nrnb, gmx_mtop_t *mtop,
2609 int nfile, const t_filenm fnm[],
2610 gmx_mdoutf_t *outf, t_mdebin **mdebin,
2611 tensor force_vir, tensor shake_vir, rvec mu_tot,
2612 gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
2613 gmx_wallcycle_t wcycle)
2617 /* Initial values */
/* Both the current time and the reference start time begin at init_t. */
2618 *t = *t0 = ir->init_t;
2621 for (i = 0; i < ir->opts.ngtc; i++)
2623 /* set bSimAnn if any group is being annealed */
2624 if (ir->opts.annealing[i] != eannNO)
/* (the *bSimAnn assignment itself is in the elided lines) */
2630 /* Initialize lambda variables */
/* Fills *fep_state, lambda and lam0 from the inputrec; see
 * initialize_lambdas(). */
2631 initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
2633 // TODO upd is never NULL in practice, but the analysers don't know that
/* (the upd null-check guarding this is in the elided lines) */
2636 *upd = init_update(ir);
/* Apply the annealing schedule at the starting time, even before the
 * first step. */
2640 update_annealing_target_temp(ir, ir->init_t, upd ? *upd : nullptr);
/* Center-of-mass motion removal setup. */
2645 *vcm = init_vcm(fplog, &mtop->groups, ir);
/* Cite the relevant methods papers once per log file, i.e. not when
 * appending to an existing log. */
2648 if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
2650 if (ir->etc == etcBERENDSEN)
2652 please_cite(fplog, "Berendsen84a");
2654 if (ir->etc == etcVRESCALE)
2656 please_cite(fplog, "Bussi2007a");
2658 if (ir->eI == eiSD1)
2660 please_cite(fplog, "Goga2012");
/* Trajectory/energy output files; the energy bin writes to the .edr file
 * handle unless we are appending (then the fp is resolved later). */
2667 *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, outputProvider, ir, mtop, oenv, wcycle);
2669 *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? nullptr : mdoutf_get_fp_ene(*outf),
2670 mtop, ir, mdoutf_get_fp_dhdl(*outf));
2673 /* Initiate variables */
/* Zero the force and constraint virial accumulators before the first step. */
2674 clear_mat(force_vir);
2675 clear_mat(shake_vir);