*/
#include "gmxpre.h"
+#include "gromacs/legacyheaders/sim_util.h"
+
#include "config.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
+
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
-#include "gromacs/legacyheaders/typedefs.h"
-#include "gromacs/utility/cstringutil.h"
-#include "gromacs/legacyheaders/names.h"
-#include "gromacs/legacyheaders/txtdump.h"
-#include "gromacs/pbcutil/pbc.h"
-#include "gromacs/legacyheaders/chargegroup.h"
-#include "gromacs/math/vec.h"
-#include "gromacs/legacyheaders/nrnb.h"
-#include "gromacs/legacyheaders/mdrun.h"
-#include "gromacs/legacyheaders/sim_util.h"
-#include "gromacs/legacyheaders/update.h"
-#include "gromacs/math/units.h"
-#include "gromacs/legacyheaders/mdatoms.h"
-#include "gromacs/legacyheaders/force.h"
-#include "gromacs/legacyheaders/bondf.h"
-#include "gromacs/legacyheaders/pme.h"
-#include "gromacs/legacyheaders/disre.h"
-#include "gromacs/legacyheaders/orires.h"
-#include "gromacs/legacyheaders/network.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
+#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
+#include "gromacs/imd/imd.h"
#include "gromacs/legacyheaders/calcmu.h"
+#include "gromacs/legacyheaders/chargegroup.h"
#include "gromacs/legacyheaders/constr.h"
#include "gromacs/legacyheaders/copyrite.h"
+#include "gromacs/legacyheaders/disre.h"
#include "gromacs/legacyheaders/domdec.h"
+#include "gromacs/legacyheaders/force.h"
#include "gromacs/legacyheaders/genborn.h"
-#include "nbnxn_atomdata.h"
-#include "nbnxn_search.h"
-#include "nbnxn_kernels/nbnxn_kernel_ref.h"
-#include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
-#include "nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
-#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+#include "gromacs/legacyheaders/mdatoms.h"
+#include "gromacs/legacyheaders/mdrun.h"
+#include "gromacs/legacyheaders/names.h"
+#include "gromacs/legacyheaders/network.h"
#include "gromacs/legacyheaders/nonbonded.h"
-#include "../gmxlib/nonbonded/nb_kernel.h"
-#include "../gmxlib/nonbonded/nb_free_energy.h"
-
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/orires.h"
+#include "gromacs/legacyheaders/pme.h"
+#include "gromacs/legacyheaders/qmmm.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/update.h"
#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/listed-forces/bonded.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_atomdata.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h"
#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
+#include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
+#include "gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"
-#include "gromacs/essentialdynamics/edsam.h"
-#include "gromacs/pulling/pull.h"
-#include "gromacs/pulling/pull_rotation.h"
-#include "gromacs/imd/imd.h"
-#include "adress.h"
-#include "gromacs/legacyheaders/qmmm.h"
-
-#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
-
-#include "nbnxn_cuda/nbnxn_cuda.h"
-#include "nb_verlet.h"
+#include "adress.h"
void print_time(FILE *out,
gmx_walltime_accounting_t walltime_accounting,
t_mdatoms *mdatoms,
gmx_enerdata_t *enerd,
real *lambda,
- double t)
+ double t,
+ gmx_wallcycle_t wcycle)
{
t_pbc pbc;
real dvdl;
* The virial contribution is calculated directly,
* which is why we call pull_potential after calc_virial.
*/
+ wallcycle_start(wcycle, ewcPULLPOT);
set_pbc(&pbc, ir->ePBC, box);
dvdl = 0;
enerd->term[F_COM_PULL] +=
pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
enerd->dvdl_lin[efptRESTRAINT] += dvdl;
+ wallcycle_stop(wcycle, ewcPULLPOT);
}
static void pme_receive_force_ener(t_commrec *cr,
}
/* We calculate the non-bonded forces, when done on the CPU, here.
- * We do this before calling do_force_lowlevel, as in there bondeds
- * forces are calculated before PME, which does communication.
- * With this order, non-bonded and bonded force calculation imbalance
- * can be balanced out by the domain decomposition load balancing.
+ * We do this before calling do_force_lowlevel, because in that
+ * function, the listed forces are calculated before PME, which
+ * does communication. With this order, non-bonded and listed
+ * force calculation imbalance can be balanced out by the domain
+ * decomposition load balancing.
*/
if (!bUseOrEmulGPU)
update_QMMMrec(cr, fr, x, mdatoms, box, top);
}
- if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_POSRES].nr > 0)
{
posres_wrapper(flags, inputrec, nrnb, top, box, x,
enerd, lambda, fr);
}
- if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
+ if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_FBPOSRES].nr > 0)
{
fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
}
do_force_lowlevel(fr, inputrec, &(top->idef),
cr, nrnb, wcycle, mdatoms,
x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
- &(top->atomtypes), bBornRadii, box,
+ bBornRadii, box,
inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
flags, &cycles_pme);
if (bDoForces && DOMAINDECOMP(cr))
{
+ if (bUseGPU)
+ {
+ /* We are done with the CPU compute, but the GPU local non-bonded
+ * kernel can still be running while we communicate the forces.
+ * We start a counter here, so we can, hopefully, time the rest
+ * of the GPU kernel execution and data transfer.
+ */
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L_EST);
+ }
+
/* Communicate the forces */
wallcycle_start(wcycle, ewcMOVEF);
dd_move_f(cr->dd, f, fr->fshift);
/* wait for local forces (or calculate in emulation mode) */
if (bUseGPU)
{
+ float cycles_tmp, cycles_wait_est;
+ const float cuda_api_overhead_margin = 50000.0f; /* cycles */
+
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
nbnxn_cuda_wait_gpu(nbv->cu_nbv,
nbv->grp[eintLocal].nbat,
flags, eatLocal,
enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
fr->fshift);
- cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+ if (bDoForces && DOMAINDECOMP(cr))
+ {
+ cycles_wait_est = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L_EST);
+
+ if (cycles_tmp < cuda_api_overhead_margin)
+ {
+ /* We measured few cycles, so it could be that the kernel
+ * and transfer finished earlier and there was no actual
+ * wait time, only API call overhead.
+ * Then the actual time could be anywhere between 0 and
+ * cycles_wait_est. As a compromise, we use half the time.
+ */
+ cycles_wait_est *= 0.5f;
+ }
+ }
+ else
+ {
+ /* No force communication so we actually timed the wait */
+ cycles_wait_est = cycles_tmp;
+ }
+ /* Even though this is after dd_move_f, the actual task we are
+ * waiting for runs asynchronously with dd_move_f and we usually
+ * have nothing to balance it with, so we can and should add
+ * the time to the force time for load balancing.
+ */
+ cycles_force += cycles_wait_est;
+ cycles_wait_gpu += cycles_wait_est;
/* now clear the GPU outputs while we finish the step on the CPU */
if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
{
+ /* Since the COM pulling is always done mass-weighted, no forces are
+ * applied to vsites and this call can be done after vsite spreading.
+ */
pull_potential_wrapper(cr, inputrec, box, x,
- f, vir_force, mdatoms, enerd, lambda, t);
+ f, vir_force, mdatoms, enerd, lambda, t,
+ wcycle);
}
/* Add the forces from enforced rotation potentials (if any) */
update_QMMMrec(cr, fr, x, mdatoms, box, top);
}
- if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_POSRES].nr > 0)
{
posres_wrapper(flags, inputrec, nrnb, top, box, x,
enerd, lambda, fr);
}
- if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
+ if ((flags & GMX_FORCE_LISTED) && top->idef.il[F_FBPOSRES].nr > 0)
{
fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
}
do_force_lowlevel(fr, inputrec, &(top->idef),
cr, nrnb, wcycle, mdatoms,
x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
- &(top->atomtypes), bBornRadii, box,
+ bBornRadii, box,
inputrec->fepvals, lambda,
graph, &(top->excls), fr->mu_tot,
flags,
if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
{
pull_potential_wrapper(cr, inputrec, box, x,
- f, vir_force, mdatoms, enerd, lambda, t);
+ f, vir_force, mdatoms, enerd, lambda, t,
+ wcycle);
}
/* Add the forces from enforced rotation potentials (if any) */
int nfile, const t_filenm fnm[],
gmx_mdoutf_t *outf, t_mdebin **mdebin,
tensor force_vir, tensor shake_vir, rvec mu_tot,
- gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags)
+ gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
+ gmx_wallcycle_t wcycle)
{
int i, j, n;
real tmpt, mod;
if (nfile != -1)
{
- *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, ir, mtop, oenv);
+ *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, ir, mtop, oenv, wcycle);
*mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : mdoutf_get_fp_ene(*outf),
mtop, ir, mdoutf_get_fp_dhdl(*outf));