// TODO should wrap with ewcLAUNCH_GPU
GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed, so transfer should not be called");
+ // TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage)
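+ // Copy all F_NRE accumulated energy terms from the device buffer d_vTot_ into
+ // the host-side vTot_ buffer below.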
float *h_vTot = vTot_.data();
copyFromDeviceBuffer(h_vTot, &d_vTot_,
0, F_NRE,
* \param[in,out] enerd Energy data structure results are reduced into
* \param[in] flags Force flags
* \param[in] pmeFlags PME flags
- * \param[in] haveOtherWork Tells whether there is other work than non-bonded in the stream(s)
* \param[in] wcycle The wallcycle structure
*/
static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
gmx_enerdata_t *enerd,
int flags,
int pmeFlags,
- bool haveOtherWork,
gmx_wallcycle_t wcycle)
{
bool isPmeGpuDone = false;
bool isNbGpuDone = false;
isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
flags,
Nbnxm::AtomLocality::Local,
- haveOtherWork,
enerd->grpp.ener[egLJSR].data(),
enerd->grpp.ener[egCOULSR].data(),
fshift, completionType);
/* Note that with a GPU the launch overhead of the list transfer is not timed separately */
nbv->constructPairlist(Nbnxm::InteractionLocality::Local,
&top->excls, step, nrnb);
+
+ nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::Local);
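+ // Refresh the short-range work flags here: the work distribution can only change
+ // at search/domain-decomposition steps, i.e. when the pair list is rebuilt.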
+
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
wallcycle_stop(wcycle, ewcNS);
if (bNS || !useGpuXBufOps)
{
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
- Nbnxm::AtomLocality::Local,
- ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::AtomLocality::Local);
}
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
// with X buffer ops offloaded to the GPU on all but the search steps
/* Note that with a GPU the launch overhead of the list transfer is not timed separately */
nbv->constructPairlist(Nbnxm::InteractionLocality::NonLocal,
&top->excls, step, nrnb);
+
+ nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::NonLocal);
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
wallcycle_stop(wcycle, ewcNS);
}
{
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
- Nbnxm::AtomLocality::NonLocal,
- ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::AtomLocality::NonLocal);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
}
if (havePPDomainDecomposition(cr))
{
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
+ flags, Nbnxm::AtomLocality::NonLocal);
}
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
+ flags, Nbnxm::AtomLocality::Local);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
flags, Nbnxm::AtomLocality::NonLocal,
- ppForceWorkload->haveGpuBondedWork,
enerd->grpp.ener[egLJSR].data(),
enerd->grpp.ener[egCOULSR].data(),
fr->fshift);
if (alternateGpuWait)
{
alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &force, &forceOut.forceWithVirial, fr->fshift, enerd,
- flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
+ flags, pmeFlags, wcycle);
}
if (!alternateGpuWait && useGpuPme)
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
- flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
+ flags, Nbnxm::AtomLocality::Local,
enerd->grpp.ener[egLJSR].data(),
enerd->grpp.ener[egCOULSR].data(),
fr->fshift);
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb,
const nbnxn_atomdata_t *nbatom,
- const AtomLocality atomLocality,
- const bool haveOtherWork)
+ const AtomLocality atomLocality)
{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal,
"Only local and non-local xq transfers are supported");
we always call the local x+q copy (and the rest of the local
work in nbnxn_gpu_launch_kernel()).
*/
- if (!haveOtherWork && canSkipWork(*nb, iloc))
+ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
{
plist->haveFreshList = false;
clearing. All these operations, except for the local interaction kernel,
are needed for the non-local interactions. The skip of the local kernel
call is taken care of later in this function. */
- if (canSkipWork(*nb, iloc))
+ if (canSkipNonbondedWork(*nb, iloc))
{
plist->haveFreshList = false;
void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
nbnxn_atomdata_t *nbatom,
const int flags,
- const AtomLocality atomLocality,
- const bool haveOtherWork)
+ const AtomLocality atomLocality)
{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
cudaError_t stat;
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
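/* Shift forces are only computed when the virial is requested. */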
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* don't launch non-local copy-back if there was no non-local work to do */
- if (!haveOtherWork && canSkipWork(*nb, iloc))
+ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
{
return;
}
initialization in local stream that is required also
by nonlocal stream) */
+ //! \brief True if there is local/nonlocal GPU work (either bonded or nonbonded)
+ //! scheduled to be executed in the current domain. As long as bonded work is not
+ //! split up into local/nonlocal, if there is bonded GPU work, both flags will be true.
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
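+ // (Indexed by interaction locality, e.g. haveWork[InteractionLocality::Local].)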
+
+
/* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
* concurrent streams, so we won't time if both l/nl work is done on GPUs.
* Timer init/uninit is still done even with timing off so only the condition
#endif
#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gpu_common_utils.h"
#include "nbnxm_gpu.h"
+namespace gmx
+{
+class GpuBonded;
+}
+
namespace Nbnxm
{
}
}
+
+void
+setupGpuShortRangeWork(gmx_nbnxn_gpu_t *nb,
+ const gmx::GpuBonded *gpuBonded,
+ const Nbnxm::InteractionLocality iLocality)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+ // There is short-range work if the pair list for the provided
+ // interaction locality contains entries or if there is any
+ // bonded work (as this is not split into local/nonlocal).
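+ // (plist->nsci is the number of i-super-cluster entries in the GPU pair list,
+ // so a non-zero value means there are nonbonded pairs to compute.)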
+ nb->haveWork[iLocality] =
+ ((nb->plist[iLocality]->nsci != 0) ||
+ (gpuBonded != nullptr && gpuBonded->haveInteractions()));
+}
+
+/*! \brief Returns true if there is GPU short-range work for the given interaction locality.
+ *
+ * Note that, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal;
+ * therefore, if there are GPU-offloaded bonded interactions, this function will return
+ * true for all interaction localities.
+ *
+ * \param[in] nb Reference to the nonbonded GPU data structure
+ * \param[in] iLocality Interaction locality identifier
+ */
+static bool
+haveGpuShortRangeWork(const gmx_nbnxn_gpu_t &nb,
+ const Nbnxm::InteractionLocality iLocality)
+{
+ return nb.haveWork[iLocality];
+}
+
+bool
+haveGpuShortRangeWork(const gmx_nbnxn_gpu_t *nb,
+ const Nbnxm::AtomLocality aLocality)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+ return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
+}
+
/*! \brief Calculate atom range and return start index and length.
*
* \param[in] atomData Atom descriptor data structure
bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
const int flags,
const AtomLocality aloc,
- const bool haveOtherWork,
real *e_lj,
real *e_el,
rvec *fshift,
const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
// We skip when, during the non-local phase, there was actually no work to do.
- // This is consistent with nbnxn_gpu_launch_kernel.
- if (haveOtherWork || !canSkipWork(*nb, iLocality))
+ // This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
+ // bonded GPU work.
+ if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(*nb, iLocality))
{
// Query the state of the GPU stream and return early if we're not done
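// (GpuTaskCompletion::Check polls and may return before the tasks are done, while
// GpuTaskCompletion::Wait, as used by gpu_wait_finish_task() below, blocks until completion.)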
if (completionKind == GpuTaskCompletion::Check)
* \param[in] nb The nonbonded data GPU structure
* \param[in] flags Force flags
* \param[in] aloc Atom locality identifier
- * \param[in] haveOtherWork Tells whether there is other work than non-bonded work in the nbnxn stream(s)
* \param[out] e_lj Pointer to the LJ energy output to accumulate into
* \param[out] e_el Pointer to the electrostatics energy output to accumulate into
* \param[out] fshift Pointer to the shift force buffer to accumulate into
void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
int flags,
AtomLocality aloc,
- bool haveOtherWork,
real *e_lj,
real *e_el,
rvec *fshift)
{
- gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift,
+ gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, fshift,
GpuTaskCompletion::Wait);
}
* local part of the force array also depends on the non-local kernel.
* The skip of the local kernel is taken care of separately.
*/
-static inline bool canSkipWork(const gmx_nbnxn_gpu_t &nb,
- InteractionLocality iloc)
+static inline bool canSkipNonbondedWork(const gmx_nbnxn_gpu_t &nb,
+ InteractionLocality iloc)
{
assert(nb.plist[iloc]);
return (iloc == InteractionLocality::NonLocal &&
rvec *f,
gmx_wallcycle *wcycle)
{
+ /* Skip the reduction if there was no short-range GPU work to do
+ * (neither nonbonded nor bonded work was scheduled for this locality). */
+ if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
+ {
+ return;
+ }
+
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
void changePairlistRadii(real rlistOuter,
real rlistInner);
+ //! Set up internal flags that indicate what type of short-range work there is.
+ void setupGpuShortRangeWork(const gmx::GpuBonded *gpuBonded,
+ const Nbnxm::InteractionLocality iLocality)
+ {
+ if (useGpu() && !emulateGpu())
+ {
+ Nbnxm::setupGpuShortRangeWork(gpu_nbv, gpuBonded, iLocality);
+ }
+ }
+
+ //! Returns true if there is GPU short-range work for the given atom locality.
+ bool haveGpuShortRangeWork(const Nbnxm::AtomLocality aLocality)
+ {
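+ // In GPU-emulation mode, or without a GPU, no work flags are set, so this
+ // deliberately reports that there is no GPU short-range work.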
+ return ((useGpu() && !emulateGpu()) &&
+ Nbnxm::haveGpuShortRangeWork(gpu_nbv, aLocality));
+ }
+
// TODO: Make all data members private
public:
//! All data related to the pair lists
struct nbnxn_atomdata_t;
enum class GpuTaskCompletion;
+namespace gmx
+{
+class GpuBonded;
+}
+
namespace Nbnxm
{
* \param [in] nb GPU nonbonded data.
* \param [in] nbdata Host-side atom data structure.
* \param [in] aloc Atom locality flag.
- * \param [in] haveOtherWork True if there are other tasks that require the nbnxn coordinate input.
*/
GPU_FUNC_QUALIFIER
void gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused *nb,
const struct nbnxn_atomdata_t gmx_unused *nbdata,
- AtomLocality gmx_unused aloc,
- bool gmx_unused haveOtherWork) GPU_FUNC_TERM
+ AtomLocality gmx_unused aloc) GPU_FUNC_TERM
/*! \brief
* Launch asynchronously the nonbonded force calculations.
int gmx_unused numParts) GPU_FUNC_TERM
/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
+ * Launch asynchronously the download of short-range forces from the GPU
* (and energies/shift forces if required).
- * When haveOtherWork=true, the copy-back is done even when there was
- * no non-bonded work.
*/
GPU_FUNC_QUALIFIER
void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb,
nbnxn_atomdata_t gmx_unused *nbatom,
int gmx_unused flags,
- AtomLocality gmx_unused aloc,
- bool gmx_unused haveOtherWork) GPU_FUNC_TERM
+ AtomLocality gmx_unused aloc) GPU_FUNC_TERM
/*! \brief Attempts to complete nonbonded GPU task.
*
* \param[in] nb The nonbonded data GPU structure
* \param[in] flags Force flags
* \param[in] aloc Atom locality identifier
- * \param[in] haveOtherWork Tells whether there is other work than non-bonded work in the nbnxn stream(s)
* \param[out] e_lj Pointer to the LJ energy output to accumulate into
* \param[out] e_el Pointer to the electrostatics energy output to accumulate into
* \param[out] fshift Pointer to the shift force buffer to accumulate into
bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
int gmx_unused flags,
AtomLocality gmx_unused aloc,
- bool gmx_unused haveOtherWork,
real gmx_unused *e_lj,
real gmx_unused *e_el,
rvec gmx_unused *fshift,
* \param[in] nb The nonbonded data GPU structure
* \param[in] flags Force flags
* \param[in] aloc Atom locality identifier
- * \param[in] haveOtherWork Tells whether there is other work than non-bonded work in the nbnxn stream(s)
* \param[out] e_lj Pointer to the LJ energy output to accumulate into
* \param[out] e_el Pointer to the electrostatics energy output to accumulate into
* \param[out] fshift Pointer to the shift force buffer to accumulate into
void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
int gmx_unused flags,
AtomLocality gmx_unused aloc,
- bool gmx_unused haveOtherWork,
real gmx_unused *e_lj,
real gmx_unused *e_el,
rvec gmx_unused *fshift) GPU_FUNC_TERM
void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused *nb,
const InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM
+/*! \brief Set up internal flags that indicate what type of short-range work there is.
+ *
+ * As nonbonded and bonded tasks share input/output buffers and GPU queues,
+ * both are considered when checking for work in the current domain.
+ *
+ * This function is expected to be called every time the work distribution
+ * can change (i.e. at search/domain decomposition steps).
+ *
+ * \param[inout] nb Pointer to the nonbonded GPU data structure
+ * \param[in] gpuBonded Pointer to the GPU bonded data structure
+ * \param[in] iLocality Interaction locality identifier
+ */
+GPU_FUNC_QUALIFIER
+void setupGpuShortRangeWork(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const gmx::GpuBonded gmx_unused *gpuBonded,
+ const Nbnxm::InteractionLocality gmx_unused iLocality) GPU_FUNC_TERM
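+// Typical call site (sketch, mirroring the search-step code shown earlier in this patch):
+//     nbv->constructPairlist(Nbnxm::InteractionLocality::Local, &top->excls, step, nrnb);
+//     nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::Local);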
+
+/*! \brief Returns true if there is GPU short-range work for the given atom locality.
+ *
+ * Note that, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal;
+ * therefore, if there are GPU-offloaded bonded interactions, this function will return
+ * true for both the local and nonlocal atom ranges.
+ *
+ * \param[in] nb Pointer to the nonbonded GPU data structure
+ * \param[in] aLocality Atom locality identifier
+ */
+GPU_FUNC_QUALIFIER
+bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t gmx_unused *nb,
+ const Nbnxm::AtomLocality gmx_unused aLocality) GPU_FUNC_TERM_WITH_RETURN(false)
+
+
} // namespace Nbnxm
#endif
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb,
const nbnxn_atomdata_t *nbatom,
- const AtomLocality atomLocality,
- const bool haveOtherWork)
+ const AtomLocality atomLocality)
{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
/* local/nonlocal offset and length used for xq and f */
we always call the local x+q copy (and the rest of the local
work in nbnxn_gpu_launch_kernel()).
*/
- if (!haveOtherWork && canSkipWork(*nb, iloc))
+ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
{
plist->haveFreshList = false;
clearing. All these operations, except for the local interaction kernel,
are needed for the non-local interactions. The skip of the local kernel
call is taken care of later in this function. */
- if (canSkipWork(*nb, iloc))
+ if (canSkipNonbondedWork(*nb, iloc))
{
plist->haveFreshList = false;
void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
struct nbnxn_atomdata_t *nbatom,
const int flags,
- const AtomLocality aloc,
- const bool haveOtherWork)
+ const AtomLocality aloc)
{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
cl_int gmx_unused cl_error;
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
/* don't launch non-local copy-back if there was no non-local work to do */
- if (!haveOtherWork && canSkipWork(*nb, iloc))
+ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
{
/* TODO An alternative way to signal that non-local work is
complete is to use a clEnqueueMarker+clEnqueueBarrier
non-local force calculations are done
(e.g. f buffer 0-ing, local x/q H2D) */
+ //! \brief True if there is local/nonlocal GPU work (either bonded or nonbonded)
+ //! scheduled to be executed in the current domain. As long as bonded work is not
+ //! split up into local/nonlocal, if there is bonded GPU work, both flags will be true.
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
+
+
cl_bool bDoTime; /**< True if event-based timing is enabled. */
cl_timers_t *timers; /**< OpenCL event-based timers. */
struct gmx_wallclock_gpu_nbnxn_t *timings; /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */