static void do_nb_verlet(t_forcerec *fr,
const interaction_const_t *ic,
gmx_enerdata_t *enerd,
- const int flags,
+ int legacyForceFlags,
+ const gmx::ForceFlags &forceFlags,
const Nbnxm::InteractionLocality ilocality,
const int clearF,
const int64_t step,
t_nrnb *nrnb,
gmx_wallcycle_t wcycle)
{
- if (!(flags & GMX_FORCE_NONBONDED))
+ if (!(legacyForceFlags & GMX_FORCE_NONBONDED))
{
/* skip non-bonded calculation */
return;
}
}
- nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb);
+ nbv->dispatchNonbondedKernel(ilocality, *ic, legacyForceFlags, forceFlags, clearF, *fr, enerd, nrnb);
}
static inline void clear_rvecs_omp(int n, rvec v[])
* \param[in,out] pmedata PME module data
* \param[in,out] forceOutputs Output buffer for the forces and virial
* \param[in,out] enerd Energy data structure results are reduced into
- * \param[in] flags Force flags
+ * \param[in] forceFlags Force schedule flags
* \param[in] pmeFlags PME flags
* \param[in] wcycle The wallcycle structure
*/
gmx_pme_t *pmedata,
gmx::ForceOutputs *forceOutputs,
gmx_enerdata_t *enerd,
- int flags,
+ const gmx::ForceFlags &forceFlags,
int pmeFlags,
gmx_wallcycle_t wcycle)
{
{
GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
- flags, // FIXME remove this
+ forceFlags,
Nbnxm::AtomLocality::Local,
enerd->grpp.ener[egLJSR].data(),
enerd->grpp.ener[egCOULSR].data(),
/* launch local nonbonded work on GPU */
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
/* launch non-local nonbonded tasks on GPU */
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (havePPDomainDecomposition(cr))
{
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- // FIXME
- flags, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
+ forceFlags, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
}
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- // FIXME
- flags, Nbnxm::AtomLocality::Local, copyBackNbForce);
+ forceFlags, Nbnxm::AtomLocality::Local, copyBackNbForce);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (forceWork.haveGpuBondedWork && forceFlags.computeEnergy)
if (!bUseOrEmulGPU)
{
- do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
+ do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
step, nrnb, wcycle);
}
{
if (havePPDomainDecomposition(cr))
{
- do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
step, nrnb, wcycle);
}
if (bUseGPU)
{
cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
- flags, Nbnxm::AtomLocality::NonLocal,
+ forceFlags, Nbnxm::AtomLocality::NonLocal,
enerd->grpp.ener[egLJSR].data(),
enerd->grpp.ener[egCOULSR].data(),
forceWithShiftForces.shiftForces(),
else
{
wallcycle_start_nocount(wcycle, ewcFORCE);
- do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
+ do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
}
if (alternateGpuWait)
{
alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd,
- flags, pmeFlags, wcycle);
+ forceFlags, pmeFlags, wcycle);
}
if (!alternateGpuWait && useGpuPme)
const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
const float waitCycles =
Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
- flags, Nbnxm::AtomLocality::Local,
+ forceFlags, Nbnxm::AtomLocality::Local,
enerd->grpp.ener[egLJSR].data(),
enerd->grpp.ener[egCOULSR].data(),
forceOut.forceWithShiftForces().shiftForces(),
// NOTE: emulation kernel is not included in the balancing region,
// but emulation mode does not target performance anyway
wallcycle_start_nocount(wcycle, ewcFORCE);
- do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local,
+ do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::Local,
DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
-#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/ppforceworkload.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_common.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
with this event in the non-local stream before launching the non-bonded kernel.
*/
void gpu_launch_kernel(gmx_nbnxn_cuda_t *nb,
- const int flags,
+ const gmx::ForceFlags &forceFlags,
const InteractionLocality iloc)
{
cu_atomdata_t *adat = nb->atdat;
cu_timers_t *t = nb->timers;
cudaStream_t stream = nb->stream[iloc];
- bool bCalcEner = flags & GMX_FORCE_ENERGY;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
bool bDoTime = nb->bDoTime;
/* Don't launch the non-local kernel if there is no work to do.
auto *timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
const auto kernel = select_nbnxn_kernel(nbp->eeltype,
nbp->vdwtype,
- bCalcEner,
+ forceFlags.computeEnergy,
(plist->haveFreshList && !nb->timers->interaction[iloc].didPrune),
nb->dev_info);
- const auto kernelArgs = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &bCalcFshift);
+ const auto kernelArgs = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &forceFlags.computeVirial);
launchGpuKernel(kernel, config, timingEvent, "k_calc_nb", kernelArgs);
if (bDoTime)
void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
nbnxn_atomdata_t *nbatom,
- const int flags,
+ const gmx::ForceFlags &forceFlags,
const AtomLocality atomLocality,
const bool copyBackNbForce)
{
bool bDoTime = nb->bDoTime;
cudaStream_t stream = nb->stream[iloc];
- bool bCalcEner = flags & GMX_FORCE_ENERGY;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
-
/* don't launch non-local copy-back if there was no non-local work to do */
if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
{
/* only transfer energies in the local stream */
if (iloc == InteractionLocality::Local)
{
- /* DtoH fshift */
- if (bCalcFshift)
+ /* DtoH fshift when virial is needed */
+ if (forceFlags.computeVirial)
{
cu_copy_D2H_async(nb->nbst.fshift, adat->fshift,
SHIFTS * sizeof(*nb->nbst.fshift), stream);
}
/* DtoH energies */
- if (bCalcEner)
+ if (forceFlags.computeEnergy)
{
cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj,
sizeof(*nb->nbst.e_lj), stream);
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/ppforceworkload.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
//TODO: move into shared source file with gmx_compile_cpp_as_cuda
//NOLINTNEXTLINE(misc-definitions-in-headers)
bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
- const int flags,
+ const gmx::ForceFlags &forceFlags,
const AtomLocality aloc,
real *e_lj,
real *e_el,
gpuStreamSynchronize(nb->stream[iLocality]);
}
- bool calcEner = (flags & GMX_FORCE_ENERGY) != 0;
- bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0;
-
- gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner,
+ gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, forceFlags.computeEnergy,
nb->bDoTime != 0);
- gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift,
+ gpu_reduce_staged_outputs(nb->nbst, iLocality, forceFlags.computeEnergy, forceFlags.computeVirial,
e_lj, e_el, as_rvec_array(shiftForces.data()));
}
* pruning flags.
*
* \param[in] nb The nonbonded data GPU structure
- * \param[in] flags Force flags
+ * \param[in] forceFlags Force schedule flags
* \param[in] aloc Atom locality identifier
* \param[out] e_lj Pointer to the LJ energy output to accumulate into
* \param[out] e_el Pointer to the electrostatics energy output to accumulate into
*/
//NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
float gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
- int flags,
+ const gmx::ForceFlags &forceFlags,
AtomLocality aloc,
real *e_lj,
real *e_el,
(gpuAtomToInteractionLocality(aloc) == InteractionLocality::Local) ? ewcWAIT_GPU_NB_L : ewcWAIT_GPU_NB_NL;
wallcycle_start(wcycle, cycleCounter);
- gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, shiftForces,
+ gpu_try_finish_task(nb, forceFlags, aloc, e_lj, e_el, shiftForces,
GpuTaskCompletion::Wait, wcycle);
float waitTime = wallcycle_stop(wcycle, cycleCounter);
void
nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality,
const interaction_const_t &ic,
- int forceFlags,
+ int legacyForceFlags,
+ const gmx::ForceFlags &forceFlags,
int clearF,
const t_forcerec &fr,
gmx_enerdata_t *enerd,
nbat.get(),
ic,
fr.shift_vec,
- forceFlags,
+ legacyForceFlags,
clearF,
enerd->grpp.ener[egCOULSR].data(),
fr.bBHAM ?
nbnxn_kernel_gpu_ref(pairlistSet.gpuList(),
nbat.get(), &ic,
fr.shift_vec,
- forceFlags,
+ legacyForceFlags,
clearF,
nbat->out[0].f,
nbat->out[0].fshift.data(),
}
- accountFlops(nrnb, pairlistSet, *this, ic, forceFlags);
+ accountFlops(nrnb, pairlistSet, *this, ic, legacyForceFlags);
}
void
//! \brief Executes the non-bonded kernel of the GPU or launches it on the GPU
void dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality,
const interaction_const_t &ic,
- int forceFlags,
+ int legacyForceFlags,
+ const gmx::ForceFlags &forceFlags,
int clearF,
const t_forcerec &fr,
gmx_enerdata_t *enerd,
namespace gmx
{
class GpuBonded;
+class ForceFlags;
}
namespace Nbnxm
*
*/
GPU_FUNC_QUALIFIER
-void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused flags,
- InteractionLocality gmx_unused iloc) GPU_FUNC_TERM;
+void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const gmx::ForceFlags gmx_unused &forceFlags,
+ InteractionLocality gmx_unused iloc) GPU_FUNC_TERM;
/*! \brief
* Launch asynchronously the nonbonded prune-only kernel.
* (and energies/shift forces if required).
*/
GPU_FUNC_QUALIFIER
-void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb,
- nbnxn_atomdata_t gmx_unused *nbatom,
- int gmx_unused flags,
- AtomLocality gmx_unused aloc,
- bool gmx_unused copyBackNbForce) GPU_FUNC_TERM;
+void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb,
+ nbnxn_atomdata_t gmx_unused *nbatom,
+ const gmx::ForceFlags gmx_unused &forceFlags,
+ AtomLocality gmx_unused aloc,
+ bool gmx_unused copyBackNbForce) GPU_FUNC_TERM;
/*! \brief Attempts to complete nonbonded GPU task.
*
* force buffer (instead of that being passed only to nbnxn_gpu_launch_cpyback()) and by returning
* the energy and Fshift contributions for some external/centralized reduction.
*
- * \param[in] nb The nonbonded data GPU structure
- * \param[in] flags Force flags
- * \param[in] aloc Atom locality identifier
- * \param[out] e_lj Pointer to the LJ energy output to accumulate into
- * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
+ * \param[in] nb The nonbonded data GPU structure
+ * \param[in] forceFlags Force schedule flags
+ * \param[in] aloc Atom locality identifier
+ * \param[out] e_lj Pointer to the LJ energy output to accumulate into
+ * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
* \param[out] shiftForces Shift forces buffer to accumulate into
* \param[in] completionKind Indicates whether nonbonded task completion should only be checked rather than waited for
- * \param[out] wcycle Pointer to wallcycle data structure
- * \returns True if the nonbonded tasks associated with \p aloc locality have completed
+ * \param[out] wcycle Pointer to wallcycle data structure
+ * \returns True if the nonbonded tasks associated with \p aloc locality have completed
*/
GPU_FUNC_QUALIFIER
-bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused flags,
- AtomLocality gmx_unused aloc,
- real gmx_unused *e_lj,
- real gmx_unused *e_el,
+bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const gmx::ForceFlags gmx_unused &forceFlags,
+ AtomLocality gmx_unused aloc,
+ real gmx_unused *e_lj,
+ real gmx_unused *e_el,
gmx::ArrayRef<gmx::RVec> gmx_unused shiftForces,
- GpuTaskCompletion gmx_unused completionKind,
- gmx_wallcycle gmx_unused *wcycle) GPU_FUNC_TERM_WITH_RETURN(false);
+ GpuTaskCompletion gmx_unused completionKind,
+ gmx_wallcycle gmx_unused *wcycle) GPU_FUNC_TERM_WITH_RETURN(false);
/*! \brief Completes the nonbonded GPU task, blocking until GPU tasks and data
* transfers have finished.
* pruning flags.
*
* \param[in] nb The nonbonded data GPU structure
- * \param[in] flags Force flags
+ * \param[in] forceFlags Force schedule flags
* \param[in] aloc Atom locality identifier
* \param[out] e_lj Pointer to the LJ energy output to accumulate into
* \param[out] e_el Pointer to the electrostatics energy output to accumulate into
*/
GPU_FUNC_QUALIFIER
float gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused flags,
- AtomLocality gmx_unused aloc,
- real gmx_unused *e_lj,
- real gmx_unused *e_el,
- gmx::ArrayRef<gmx::RVec> gmx_unused shiftForces,
- gmx_wallcycle gmx_unused *wcycle) GPU_FUNC_TERM_WITH_RETURN(0.0);
+ const gmx::ForceFlags gmx_unused &forceFlags,
+ AtomLocality gmx_unused aloc,
+ real gmx_unused *e_lj,
+ real gmx_unused *e_el,
+ gmx::ArrayRef<gmx::RVec> gmx_unused shiftForces,
+ gmx_wallcycle gmx_unused *wcycle) GPU_FUNC_TERM_WITH_RETURN(0.0);
/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
GPU_FUNC_QUALIFIER
#include "gromacs/gpu_utils/gputraits_ocl.h"
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/hardware/hw_info.h"
-#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdlib/ppforceworkload.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_common.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
are finished and synchronize with this event in the non-local stream.
*/
void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb,
- const int flags,
+ const gmx::ForceFlags &forceFlags,
const Nbnxm::InteractionLocality iloc)
{
cl_atomdata_t *adat = nb->atdat;
cl_timers_t *t = nb->timers;
cl_command_queue stream = nb->stream[iloc];
- bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0;
- int bCalcFshift = flags & GMX_FORCE_VIRIAL;
bool bDoTime = (nb->bDoTime) != 0;
cl_nbparam_params_t nbparams_params;
const auto kernel = select_nbnxn_kernel(nb,
nbp->eeltype,
nbp->vdwtype,
- bCalcEner,
+ forceFlags.computeEnergy,
(plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
+ // The OpenCL kernel takes int as second to last argument because bool is
+ // not supported as a kernel argument type (sizeof(bool) is implementation defined).
+ const int computeFshift = forceFlags.computeVirial;
if (useLjCombRule(nb->nbparam->vdwtype))
{
const auto kernelArgs = prepareGpuKernelArguments(kernel, config,
&nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
&adat->lj_comb,
&adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
- &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift);
+ &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
}
&nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
&adat->atom_types,
&adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
- &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift);
+ &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
}
*/
void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
struct nbnxn_atomdata_t *nbatom,
- const int flags,
+ const gmx::ForceFlags &forceFlags,
const AtomLocality aloc,
- const bool gmx_unused copyBackNbForce)
+ const bool gmx_unused copyBackNbForce)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
bool bDoTime = nb->bDoTime == CL_TRUE;
cl_command_queue stream = nb->stream[iloc];
- bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0;
- int bCalcFshift = flags & GMX_FORCE_VIRIAL;
-
-
/* don't launch non-local copy-back if there was no non-local work to do */
if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
{
/* only transfer energies in the local stream */
if (iloc == InteractionLocality::Local)
{
- /* DtoH fshift */
- if (bCalcFshift)
+ /* DtoH fshift when virial is needed */
+ if (forceFlags.computeVirial)
{
ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
}
/* DtoH energies */
- if (bCalcEner)
+ if (forceFlags.computeEnergy)
{
ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);