From 19ceb46678f66831acca3359f43a6be3241893ad Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Thu, 27 Jun 2019 19:53:08 +0200
Subject: [PATCH] Refactor tracking of GPU short-range work/skipping

This change introduces a set of flags that indicate, for each
interaction locality, whether there are short-range interactions to
compute, and exposes a query for them in the nonbonded module's API.
This allows consistent checks both for whether work has been done and
for whether results need to be reduced.

Refs #2986

Change-Id: I15020d83f73a132d9b8e93d7339529176396089a
---
 src/gromacs/listed_forces/gpubonded_impl.cu |  1 +
 src/gromacs/mdlib/sim_util.cpp              | 23 ++++----
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu        | 16 +++---
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h   |  6 +++
 src/gromacs/nbnxm/gpu_common.h              | 59 ++++++++++++++++++---
 src/gromacs/nbnxm/gpu_common_utils.h        |  4 +-
 src/gromacs/nbnxm/nbnxm.cpp                 |  7 +++
 src/gromacs/nbnxm/nbnxm.h                   | 17 ++++++
 src/gromacs/nbnxm/nbnxm_gpu.h               | 51 +++++++++++++-----
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp      | 16 +++---
 src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h  |  6 +++
 11 files changed, 160 insertions(+), 46 deletions(-)

diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu
index 065e88e253..d269c6918b 100644
--- a/src/gromacs/listed_forces/gpubonded_impl.cu
+++ b/src/gromacs/listed_forces/gpubonded_impl.cu
@@ -266,6 +266,7 @@ GpuBonded::Impl::launchEnergyTransfer()
     // TODO should wrap with ewcLAUNCH_GPU
     GMX_ASSERT(haveInteractions_,
                "No GPU bonded interactions, so no energies will be computed, so transfer should not be called");
+    // TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage)
     float *h_vTot = vTot_.data();
     copyFromDeviceBuffer(h_vTot, &d_vTot_,
                          0, F_NRE,
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index ad61dee88a..5323c187a9 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -646,7 +646,6 @@ static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
  * \param[in,out] enerd         Energy data structure results are reduced into
  * \param[in]     flags         Force flags
  * \param[in]     pmeFlags      PME flags
- * \param[in]     haveOtherWork Tells whether there is other work than non-bonded in the stream(s)
  * \param[in]     wcycle        The wallcycle structure
  */
 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
@@ -657,7 +656,6 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv
                                         gmx_enerdata_t     *enerd,
                                         int                 flags,
                                         int                 pmeFlags,
-                                        bool                haveOtherWork,
                                         gmx_wallcycle_t     wcycle)
 {
     bool isPmeGpuDone = false;
@@ -681,7 +679,6 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv
             isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
                                                      flags,
                                                      Nbnxm::AtomLocality::Local,
-                                                     haveOtherWork,
                                                      enerd->grpp.ener[egLJSR].data(),
                                                      enerd->grpp.ener[egCOULSR].data(),
                                                      fshift, completionType);
@@ -1035,6 +1032,9 @@ void do_force(FILE *fplog,
             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
             nbv->constructPairlist(Nbnxm::InteractionLocality::Local,
                                    &top->excls, step, nrnb);
+
+            nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::Local);
+
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
             wallcycle_stop(wcycle, ewcNS);
@@ -1061,8 +1061,7 @@ void do_force(FILE *fplog,
         if (bNS || !useGpuXBufOps)
         {
             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
-                                      Nbnxm::AtomLocality::Local,
-                                      ppForceWorkload->haveGpuBondedWork);
+                                      Nbnxm::AtomLocality::Local);
         }
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         // with X buffer ops offloaded to the GPU on all but the search steps
@@ -1105,6 +1104,8 @@ void do_force(FILE *fplog,
             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
             nbv->constructPairlist(Nbnxm::InteractionLocality::NonLocal,
                                    &top->excls, step, nrnb);
+
+            nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::NonLocal);
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             wallcycle_stop(wcycle, ewcNS);
         }
@@ -1125,8 +1126,7 @@ void do_force(FILE *fplog,
         {
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
-                                      Nbnxm::AtomLocality::NonLocal,
-                                      ppForceWorkload->haveGpuBondedWork);
+                                      Nbnxm::AtomLocality::NonLocal);
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         }
@@ -1155,10 +1155,10 @@ void do_force(FILE *fplog,
         if (havePPDomainDecomposition(cr))
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                      flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
+                                      flags, Nbnxm::AtomLocality::NonLocal);
         }
         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                  flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
+                                  flags, Nbnxm::AtomLocality::Local);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
@@ -1323,7 +1323,6 @@ void do_force(FILE *fplog,
             wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
             Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
                                         flags, Nbnxm::AtomLocality::NonLocal,
-                                        ppForceWorkload->haveGpuBondedWork,
                                         enerd->grpp.ener[egLJSR].data(),
                                         enerd->grpp.ener[egCOULSR].data(),
                                         fr->fshift);
@@ -1369,7 +1368,7 @@ void do_force(FILE *fplog,
     if (alternateGpuWait)
     {
         alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &force, &forceOut.forceWithVirial, fr->fshift, enerd,
-                                    flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
+                                    flags, pmeFlags, wcycle);
     }
 
     if (!alternateGpuWait && useGpuPme)
@@ -1389,7 +1388,7 @@ void do_force(FILE *fplog,
         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
         Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
-                                    flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
+                                    flags, Nbnxm::AtomLocality::Local,
                                     enerd->grpp.ener[egLJSR].data(),
                                     enerd->grpp.ener[egCOULSR].data(),
                                     fr->fshift);
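To make the do_force() pattern above concrete: the per-locality flags are written once per search/DD step and read at every later launch/copy-back/wait decision, replacing the haveOtherWork argument that used to be threaded through each call. A minimal, self-contained sketch of that life cycle follows; NbnxmGpuStub, GpuBondedStub and their members are illustrative stand-ins, not the real GROMACS types.

    #include <array>
    #include <cstddef>

    // Stand-ins for the real GROMACS types; only the control flow is real.
    enum class InteractionLocality : int { Local = 0, NonLocal = 1, Count = 2 };

    struct GpuPairlistStub { int  nsci = 0; };               // pair-list entry count
    struct GpuBondedStub   { bool haveInteractions = false; };

    struct NbnxmGpuStub
    {
        std::array<GpuPairlistStub, static_cast<std::size_t>(InteractionLocality::Count)> plist;
        std::array<bool, static_cast<std::size_t>(InteractionLocality::Count)>            haveWork{};

        // Mirrors setupGpuShortRangeWork(): called at search/DD steps, when
        // the work distribution can change.
        void setupShortRangeWork(const GpuBondedStub *bonded, InteractionLocality iloc)
        {
            const auto i = static_cast<std::size_t>(iloc);
            haveWork[i]  = (plist[i].nsci != 0)
                           || (bonded != nullptr && bonded->haveInteractions);
        }

        // Mirrors haveGpuShortRangeWork(): queried wherever a skip decision
        // used to depend on the threaded-through haveOtherWork flag.
        bool haveShortRangeWork(InteractionLocality iloc) const
        {
            return haveWork[static_cast<std::size_t>(iloc)];
        }
    };

Because bonded work is not split by locality, any offloaded bonded task marks both localities as busy; that is the behavior the nonlocal copy-back and wait paths in the following files rely on.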
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index a0117a5cb1..e8e6c5b5cc 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -309,9 +309,10 @@ void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t *nb,
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
                         const nbnxn_atomdata_t *nbatom,
-                        const AtomLocality      atomLocality,
-                        const bool              haveOtherWork)
+                        const AtomLocality      atomLocality)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     GMX_ASSERT(atomLocality == AtomLocality::Local ||
                atomLocality == AtomLocality::NonLocal,
                "Only local and non-local xq transfers are supported");
@@ -335,7 +336,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb,
        we always call the local local x+q copy (and the rest of the local
        work in nbnxn_gpu_launch_kernel().
      */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
         plist->haveFreshList = false;
 
@@ -418,7 +419,7 @@ void gpu_launch_kernel(gmx_nbnxn_cuda_t *nb,
        clearing. All these operations, except for the local interaction kernel, are needed
        for the non-local interactions. The skip of the local kernel call is taken care of later
        in this function. */
-    if (canSkipWork(*nb, iloc))
+    if (canSkipNonbondedWork(*nb, iloc))
     {
         plist->haveFreshList = false;
 
@@ -639,9 +640,10 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb,
 void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
                         nbnxn_atomdata_t *nbatom,
                         const int         flags,
-                        const AtomLocality atomLocality,
-                        const bool         haveOtherWork)
+                        const AtomLocality atomLocality)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     cudaError_t stat;
     int         adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
@@ -658,7 +660,7 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 
     /* don't launch non-local copy-back if there was no non-local work to do */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
         return;
     }
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index 48c8776c79..ff8705df59 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -260,6 +260,12 @@ struct gmx_nbnxn_cuda_t
                                          initialization in local stream that is required also
                                          by nonlocal stream ) */
 
+    //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled
+    //  to be executed in the current domain. As long as bonded work is not split up into
+    //  local/nonlocal, if there is bonded GPU work, both flags will be true.
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
+
     /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
      * concurrent streams, so we won't time if both l/nl work is done on GPUs.
      * Timer init/uninit is still done even with timing off so only the condition
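The haveWork member added above is an enum-indexed array, so call sites use the locality enum directly instead of casting to int. A hand-rolled stand-in showing the access pattern (the real gmx::EnumerationArray, in the utility headers, offers more, e.g. iteration; this is only a sketch):

    #include <array>
    #include <cstddef>

    enum class InteractionLocality : int { Local, NonLocal, Count };

    // Minimal stand-in for gmx::EnumerationArray<InteractionLocality, bool>.
    template<typename Enum, typename T>
    struct EnumerationArrayStub
    {
        std::array<T, static_cast<std::size_t>(Enum::Count)> storage{};

        T       &operator[](Enum e)       { return storage[static_cast<std::size_t>(e)]; }
        const T &operator[](Enum e) const { return storage[static_cast<std::size_t>(e)]; }
    };

    int main()
    {
        EnumerationArrayStub<InteractionLocality, bool> haveWork;
        haveWork[InteractionLocality::Local]    = true;  // e.g. nonempty local list
        haveWork[InteractionLocality::NonLocal] = true;  // bonded work flips this too
        return haveWork[InteractionLocality::NonLocal] ? 0 : 1;
    }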
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index 1624c56c8c..4f4edddbeb 100644
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -56,6 +56,7 @@
 #endif
 
 #include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/listed_forces/gpubonded.h"
 #include "gromacs/math/vec.h"
 #include "gromacs/mdlib/force_flags.h"
 #include "gromacs/nbnxm/nbnxm.h"
@@ -67,6 +68,11 @@
 #include "gpu_common_utils.h"
 #include "nbnxm_gpu.h"
 
+namespace gmx
+{
+class GpuBonded;
+}
+
 namespace Nbnxm
 {
 
@@ -117,6 +123,49 @@ gpuAtomToInteractionLocality(const AtomLocality atomLocality)
     }
 }
 
+
+void
+setupGpuShortRangeWork(gmx_nbnxn_gpu_t                  *nb,
+                       const gmx::GpuBonded             *gpuBonded,
+                       const Nbnxm::InteractionLocality  iLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    // There is short-range work if the pair list for the provided
+    // interaction locality contains entries or if there is any
+    // bonded work (as this is not split into local/nonlocal).
+    nb->haveWork[iLocality] =
+        ((nb->plist[iLocality]->nsci != 0) ||
+         (gpuBonded != nullptr && gpuBonded->haveInteractions()));
+}
+
+/*! \brief Returns true if there is GPU short-range work for the given interaction locality.
+ *
+ * Note that, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal;
+ * therefore, if there are GPU-offloaded bonded interactions, this function returns
+ * true for all interaction localities.
+ *
+ * \param[inout] nb        Pointer to the nonbonded GPU data structure
+ * \param[in]    iLocality Interaction locality identifier
+ */
+static bool
+haveGpuShortRangeWork(const gmx_nbnxn_gpu_t            &nb,
+                      const Nbnxm::InteractionLocality  iLocality)
+{
+    return nb.haveWork[iLocality];
+}
+
+bool
+haveGpuShortRangeWork(const gmx_nbnxn_gpu_t     *nb,
+                      const Nbnxm::AtomLocality  aLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
+}
+
+
+
 /*! \brief Calculate atom range and return start index and length.
  *
  * \param[in] atomData Atom descriptor data structure
@@ -319,7 +368,6 @@ gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
 bool gpu_try_finish_task(gmx_nbnxn_gpu_t  *nb,
                          const int         flags,
                          const AtomLocality aloc,
-                         const bool        haveOtherWork,
                          real             *e_lj,
                          real             *e_el,
                          rvec             *fshift,
@@ -331,8 +379,9 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
 
     // We skip when during the non-local phase there was actually no work to do.
-    // This is consistent with nbnxn_gpu_launch_kernel.
-    if (haveOtherWork || !canSkipWork(*nb, iLocality))
+    // This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
+    // bonded GPU work.
+    if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(*nb, iLocality))
     {
         // Query the state of the GPU stream and return early if we're not done
         if (completionKind == GpuTaskCompletion::Check)
@@ -378,7 +427,6 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
 * \param[in]  nb     The nonbonded data GPU structure
 * \param[in]  flags  Force flags
 * \param[in]  aloc   Atom locality identifier
- * \param[in]  haveOtherWork  Tells whether there is other work than non-bonded work in the nbnxn stream(s)
 * \param[out] e_lj   Pointer to the LJ energy output to accumulate into
 * \param[out] e_el   Pointer to the electrostatics energy output to accumulate into
 * \param[out] fshift Pointer to the shift force buffer to accumulate into
@@ -387,12 +435,11 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
 void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
                           int              flags,
                           AtomLocality     aloc,
-                          bool             haveOtherWork,
                           real            *e_lj,
                           real            *e_el,
                           rvec            *fshift)
 {
-    gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift,
+    gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, fshift,
                         GpuTaskCompletion::Wait);
 }
 
diff --git a/src/gromacs/nbnxm/gpu_common_utils.h b/src/gromacs/nbnxm/gpu_common_utils.h
index 02febb47ab..77d3b08e96 100644
--- a/src/gromacs/nbnxm/gpu_common_utils.h
+++ b/src/gromacs/nbnxm/gpu_common_utils.h
@@ -64,8 +64,8 @@ namespace Nbnxm
  * local part of the force array also depends on the non-local kernel.
 * The skip of the local kernel is taken care of separately.
 */
-static inline bool canSkipWork(const gmx_nbnxn_gpu_t &nb,
-                               InteractionLocality    iloc)
+static inline bool canSkipNonbondedWork(const gmx_nbnxn_gpu_t &nb,
+                                        InteractionLocality    iloc)
 {
     assert(nb.plist[iloc]);
     return (iloc == InteractionLocality::NonLocal &&
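Two predicates now coexist on purpose: the renamed canSkipNonbondedWork() above answers the narrower, nonbonded-only question asked inside kernel launch, while haveGpuShortRangeWork() additionally folds in bonded work. Roughly, in terms of the stand-ins from the first sketch (the real canSkipNonbondedWork() also accounts for fresh-list pruning bookkeeping, elided here):

    // Nonbonded-only: an empty nonlocal pair list means the NB kernel proper
    // has nothing to do (pruning bookkeeping aside).
    bool canSkipNonbonded(const NbnxmGpuStub &nb, InteractionLocality iloc)
    {
        const auto i = static_cast<std::size_t>(iloc);
        return (iloc == InteractionLocality::NonLocal) && (nb.plist[i].nsci == 0);
    }

    // Short-range: the nonlocal stages (xq H2D, copy-back, wait) may be skipped
    // only when neither nonbonded nor bonded work was scheduled there.
    bool canSkipShortRange(const NbnxmGpuStub &nb, InteractionLocality iloc)
    {
        return (iloc == InteractionLocality::NonLocal) && !nb.haveShortRangeWork(iloc);
    }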
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 54c9ff9864..a81a6b400e 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -164,6 +164,13 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality
                                              rvec                       *f,
                                              gmx_wallcycle              *wcycle)
 {
+    /* Skip the reduction if there was no short-range GPU work to do
+     * (either NB or both NB and bonded work).
+     */
+    if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
+    {
+        return;
+    }
     wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
 
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 72eb98ae59..7e3869b451 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -317,6 +317,23 @@ struct nonbonded_verlet_t
 
         void changePairlistRadii(real rlistOuter, real rlistInner);
 
+        //! Set up internal flags that indicate what type of short-range work there is.
+        void setupGpuShortRangeWork(const gmx::GpuBonded             *gpuBonded,
+                                    const Nbnxm::InteractionLocality  iLocality)
+        {
+            if (useGpu() && !emulateGpu())
+            {
+                Nbnxm::setupGpuShortRangeWork(gpu_nbv, gpuBonded, iLocality);
+            }
+        }
+
+        //! Returns true if there is GPU short-range work for the given atom locality.
+        bool haveGpuShortRangeWork(const Nbnxm::AtomLocality aLocality)
+        {
+            return ((useGpu() && !emulateGpu()) &&
+                    Nbnxm::haveGpuShortRangeWork(gpu_nbv, aLocality));
+        }
+
         // TODO: Make all data members private
     public:
         //! All data related to the pair lists
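The wrapper methods above centralize the "real GPU, not emulation" guard, so do_force() can call them unconditionally; the F-buffer reduction then keys off the same query. A compact stand-alone sketch of that early-out (stand-in class and hypothetical names; the real methods live on nonbonded_verlet_t as shown above):

    #include <cstdio>

    struct NonbondedVerletStub
    {
        bool useGpuNb       = true;   // real GPU path, not emulation
        bool simplePairlist = false;  // GPU layouts are not "simple"
        bool haveWorkLocal  = false;  // recorded at the last search step

        bool pairlistIsSimple() const { return simplePairlist; }
        bool haveGpuShortRangeWork() const { return useGpuNb && haveWorkLocal; }
    };

    // Mirrors the early-out added to atomdata_add_nbat_f_to_f() above.
    void addNbatForcesToF(const NonbondedVerletStub &nbv)
    {
        if (!nbv.pairlistIsSimple() && !nbv.haveGpuShortRangeWork())
        {
            return;  // nothing was computed on the GPU, nothing to reduce
        }
        std::puts("reducing nbat forces into the rvec force buffer");
    }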
* @@ -171,7 +171,6 @@ void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, * \param[in] nb The nonbonded data GPU structure * \param[in] flags Force flags * \param[in] aloc Atom locality identifier - * \param[in] haveOtherWork Tells whether there is other work than non-bonded work in the nbnxn stream(s) * \param[out] e_lj Pointer to the LJ energy output to accumulate into * \param[out] e_el Pointer to the electrostatics energy output to accumulate into * \param[out] fshift Pointer to the shift force buffer to accumulate into @@ -182,7 +181,6 @@ GPU_FUNC_QUALIFIER bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, int gmx_unused flags, AtomLocality gmx_unused aloc, - bool gmx_unused haveOtherWork, real gmx_unused *e_lj, real gmx_unused *e_el, rvec gmx_unused *fshift, @@ -198,7 +196,6 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, * \param[in] nb The nonbonded data GPU structure * \param[in] flags Force flags * \param[in] aloc Atom locality identifier - * \param[in] haveOtherWork Tells whether there is other work than non-bonded work in the nbnxn stream(s) * \param[out] e_lj Pointer to the LJ energy output to accumulate into * \param[out] e_el Pointer to the electrostatics energy output to accumulate into * \param[out] fshift Pointer to the shift force buffer to accumulate into @@ -207,7 +204,6 @@ GPU_FUNC_QUALIFIER void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, int gmx_unused flags, AtomLocality gmx_unused aloc, - bool gmx_unused haveOtherWork, real gmx_unused *e_lj, real gmx_unused *e_el, rvec gmx_unused *fshift) GPU_FUNC_TERM @@ -242,6 +238,37 @@ CUDA_FUNC_QUALIFIER void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused *nb, const InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM +/*! \brief Set up internal flags that indicate what type of short-range work there is. + * + * As nonbondeds and bondeds share input/output buffers and GPU queues, + * both are considered when checking for work in the current domain. + * + * This function is expected to be called every time the work-distribution + * can change (i.e. at search/domain decomposition steps). + * + * \param[inout] nb Pointer to the nonbonded GPU data structure + * \param[in] gpuBonded Pointer to the GPU bonded data structure + * \param[in] iLocality Interaction locality identifier + */ +GPU_FUNC_QUALIFIER +void setupGpuShortRangeWork(gmx_nbnxn_gpu_t gmx_unused *nb, + const gmx::GpuBonded gmx_unused *gpuBonded, + const Nbnxm::InteractionLocality gmx_unused iLocality) GPU_FUNC_TERM + +/*! \brief Returns true if there is GPU short-range work for the given atom locality. + * + * Note that as, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal, + * and therefore if there are GPU offloaded bonded interactions, this function will return + * true for both local and nonlocal atom range. + * + * \param[inout] nb Pointer to the nonbonded GPU data structure + * \param[in] aLocality Atom locality identifier + */ +GPU_FUNC_QUALIFIER +bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t gmx_unused *nb, + const Nbnxm::AtomLocality gmx_unused aLocality) GPU_FUNC_TERM_WITH_RETURN(false) + + } // namespace Nbnxm #endif diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index d3b860e6e6..659c484287 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -365,9 +365,10 @@ static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event) /*! 
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index d3b860e6e6..659c484287 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -365,9 +365,10 @@ static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
                         const nbnxn_atomdata_t *nbatom,
-                        const AtomLocality      atomLocality,
-                        const bool              haveOtherWork)
+                        const AtomLocality      atomLocality)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 
     /* local/nonlocal offset and length used for xq and f */
@@ -389,7 +390,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb,
        we always call the local local x+q copy (and the rest of the local
        work in nbnxn_gpu_launch_kernel().
      */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
         plist->haveFreshList = false;
 
@@ -491,7 +492,7 @@ void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb,
        clearing. All these operations, except for the local interaction kernel, are needed
        for the non-local interactions. The skip of the local kernel call is taken care of later
        in this function. */
-    if (canSkipWork(*nb, iloc))
+    if (canSkipNonbondedWork(*nb, iloc))
     {
         plist->haveFreshList = false;
 
@@ -733,9 +734,10 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
 void gpu_launch_cpyback(gmx_nbnxn_ocl_t         *nb,
                         struct nbnxn_atomdata_t *nbatom,
                         const int                flags,
-                        const AtomLocality       aloc,
-                        const bool               haveOtherWork)
+                        const AtomLocality       aloc)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     cl_int gmx_unused cl_error;
     int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 
@@ -752,7 +754,7 @@ void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
 
     /* don't launch non-local copy-back if there was no non-local work to do */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
         /* TODO An alternative way to signal that non-local work is
            complete is to use a clEnqueueMarker+clEnqueueBarrier
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
index 57d945bdcb..55d93e74f5 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -298,6 +298,12 @@ struct gmx_nbnxn_ocl_t
                                            non-local force calculations are done
                                            (e.g. f buffer 0-ing, local x/q H2D) */
 
+    //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled
+    //  to be executed in the current domain. As long as bonded work is not split up into
+    //  local/nonlocal, if there is bonded GPU work, both flags will be true.
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
+
     cl_bool                           bDoTime;  /**< True if event-based timing is enabled. */
     cl_timers_t                      *timers;   /**< OpenCL event-based timers. */
     struct gmx_wallclock_gpu_nbnxn_t *timings;  /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */
-- 
2.22.0
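Tying the pieces together, the semantics the patch is after can be exercised end to end with the stand-ins from the first sketch: even an empty nonlocal pair list must not skip the nonlocal copy-back/wait when bonded work shares the stream (hypothetical values, illustration only):

    int main()
    {
        NbnxmGpuStub  nb;
        GpuBondedStub bonded;
        bonded.haveInteractions = true;  // bonded interactions are offloaded

        // Search/DD step: local list has entries, nonlocal list is empty.
        nb.plist[0].nsci = 128;
        nb.plist[1].nsci = 0;
        nb.setupShortRangeWork(&bonded, InteractionLocality::Local);
        nb.setupShortRangeWork(&bonded, InteractionLocality::NonLocal);

        // MD step: the nonlocal stages must still run, because bonded results
        // share buffers and streams with the nonbonded ones.
        const bool skip = !nb.haveShortRangeWork(InteractionLocality::NonLocal);
        return skip ? 1 : 0;  // expect 0: bonded work forbids the skip
    }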