From 615bd80a1afb58a9448ddb6426103dc07b677bda Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Tue, 9 Mar 2021 19:17:15 +0300 Subject: [PATCH] Unify gpu_copy_xq_to_gpu(...) function Refs. #2608 --- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 70 -------------- src/gromacs/nbnxm/gpu_common.h | 108 ---------------------- src/gromacs/nbnxm/gpu_common_utils.h | 89 +++++++++++++++++- src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp | 88 ++++++++++++++++++ src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 70 -------------- src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp | 57 ------------ 6 files changed, 175 insertions(+), 307 deletions(-) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 0ddaa17b14..6ed6c1ff78 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -447,76 +447,6 @@ void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality in } } -/*! \brief Launch asynchronously the xq buffer host to device copy. */ -void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - - - NBAtomData* adat = nb->atdat; - gpu_plist* plist = nb->plist[iloc]; - Nbnxm::GpuTimers* timers = nb->timers; - const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; - - bool bDoTime = nb->bDoTime; - - /* Don't launch the non-local H2D copy if there is no dependent - work to do: neither non-local nor other (e.g. bonded) work - to do that has as input the nbnxn coordaintes. - Doing the same for the local kernel is more complicated, since the - local part of the force array also depends on the non-local kernel. - So to avoid complicating the code and to reduce the risk of bugs, - we always call the local local x+q copy (and the rest of the local - work in nbnxn_gpu_launch_kernel(). 
- */ - if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) - { - plist->haveFreshList = false; - - // The event is marked for Local interactions unconditionally, - // so it has to be released here because of the early return - // for NonLocal interactions. - nb->misc_ops_and_local_H2D_done.reset(); - - return; - } - - /* local/nonlocal offset and length used for xq and f */ - auto atomsRange = getGpuAtomRange(adat, atomLocality); - - /* beginning of timed HtoD section */ - if (bDoTime) - { - timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); - } - - /* HtoD x, q */ - static_assert(sizeof(adat->xq[0]) == sizeof(Float4), - "The size of the xyzq buffer element should be equal to the size of float4."); - copyToDeviceBuffer(&adat->xq, - reinterpret_cast(nbatom->x().data()) + atomsRange.begin(), - atomsRange.begin(), - atomsRange.size(), - deviceStream, - GpuApiCallBehavior::Async, - nullptr); - - if (bDoTime) - { - timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); - } - - /* When we get here all misc operations issued in the local stream as well as - the local xq H2D are done, - so we record that in the local stream and wait for it in the nonlocal one. - This wait needs to precede any PP tasks, bonded or nonbonded, that may - compute on interactions between local and nonlocal atoms. - */ - nbnxnInsertNonlocalGpuDependency(nb, iloc); -} - /*! 
As we execute nonbonded workload in separate streams, before launching the kernel we need to make sure that he following operations have completed: - atomdata allocation and related H2D transfers (every nstlist step); diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index b2663b33f9..39c6d29057 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -60,15 +60,12 @@ #endif #include "gromacs/gpu_utils/gpu_utils.h" -#include "gromacs/listed_forces/gpubonded.h" #include "gromacs/math/vec.h" #include "gromacs/mdtypes/simulation_workload.h" #include "gromacs/nbnxm/nbnxm.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/timing/gpu_timing.h" #include "gromacs/timing/wallcycle.h" -#include "gromacs/utility/fatalerror.h" -#include "gromacs/utility/range.h" #include "gromacs/utility/stringutil.h" #include "gpu_common_utils.h" @@ -82,111 +79,6 @@ class GpuBonded; namespace Nbnxm { -/*! \brief Check that atom locality values are valid for the GPU module. - * - * In the GPU module atom locality "all" is not supported, the local and - * non-local ranges are treated separately. - * - * \param[in] atomLocality atom locality specifier - */ -static inline void validateGpuAtomLocality(const AtomLocality atomLocality) -{ - std::string str = gmx::formatString( - "Invalid atom locality passed (%d); valid here is only " - "local (%d) or nonlocal (%d)", - static_cast(atomLocality), - static_cast(AtomLocality::Local), - static_cast(AtomLocality::NonLocal)); - - GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str()); -} - -/*! \brief Convert atom locality to interaction locality. - * - * In the current implementation the this is straightforward conversion: - * local to local, non-local to non-local. - * - * \param[in] atomLocality Atom locality specifier - * \returns Interaction locality corresponding to the atom locality passed. 
- */ -static inline InteractionLocality gpuAtomToInteractionLocality(const AtomLocality atomLocality) -{ - validateGpuAtomLocality(atomLocality); - - /* determine interaction locality from atom locality */ - if (atomLocality == AtomLocality::Local) - { - return InteractionLocality::Local; - } - else if (atomLocality == AtomLocality::NonLocal) - { - return InteractionLocality::NonLocal; - } - else - { - gmx_incons("Wrong locality"); - } -} - - -//NOLINTNEXTLINE(misc-definitions-in-headers) -void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - // There is short-range work if the pair list for the provided - // interaction locality contains entries or if there is any - // bonded work (as this is not split into local/nonlocal). - nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0) - || (gpuBonded != nullptr && gpuBonded->haveInteractions())); -} - -/*! \brief Returns true if there is GPU short-range work for the given interaction locality. - * - * Note that as, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal, - * and therefore if there are GPU offloaded bonded interactions, this function will return - * true for all interaction localities. - * - * \param[inout] nb Pointer to the nonbonded GPU data structure - * \param[in] iLocality Interaction locality identifier - */ -static bool haveGpuShortRangeWork(const NbnxmGpu& nb, const gmx::InteractionLocality iLocality) -{ - return nb.haveWork[iLocality]; -} - -//NOLINTNEXTLINE(misc-definitions-in-headers) -bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality)); -} - - -/*! \brief Calculate atom range and return start index and length. 
- * - * \param[in] atomData Atom descriptor data structure - * \param[in] atomLocality Atom locality specifier - * \returns Range of indexes for selected locality. - */ -static inline gmx::Range getGpuAtomRange(const NBAtomData* atomData, const AtomLocality atomLocality) -{ - assert(atomData); - validateGpuAtomLocality(atomLocality); - - /* calculate the atom data index range based on locality */ - if (atomLocality == AtomLocality::Local) - { - return gmx::Range(0, atomData->numAtomsLocal); - } - else - { - return gmx::Range(atomData->numAtomsLocal, atomData->numAtoms); - } -} - - /*! \brief Count pruning kernel time if either kernel has been triggered * * We do the accounting for either of the two pruning kernel flavors: diff --git a/src/gromacs/nbnxm/gpu_common_utils.h b/src/gromacs/nbnxm/gpu_common_utils.h index af0c69f36c..ea4f2d9d63 100644 --- a/src/gromacs/nbnxm/gpu_common_utils.h +++ b/src/gromacs/nbnxm/gpu_common_utils.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2017,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2017,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -44,7 +44,10 @@ #include "config.h" -#include "gromacs/nbnxm/nbnxm.h" +#include "gromacs/listed_forces/gpubonded.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/range.h" +#include "gromacs/nbnxm/nbnxm_gpu.h" #if GMX_GPU_CUDA # include "cuda/nbnxm_cuda_types.h" @@ -70,6 +73,88 @@ static inline bool canSkipNonbondedWork(const NbnxmGpu& nb, InteractionLocality return (iloc == InteractionLocality::NonLocal && nb.plist[iloc]->nsci == 0); } +/*! \brief Check that atom locality values are valid for the GPU module. 
+ * + * In the GPU module atom locality "all" is not supported; the local and + * non-local ranges are treated separately. + * + * \param[in] atomLocality atom locality specifier + */ +static inline void validateGpuAtomLocality(const AtomLocality atomLocality) +{ + std::string str = gmx::formatString( + "Invalid atom locality passed (%d); valid here is only " + "local (%d) or nonlocal (%d)", + static_cast(atomLocality), + static_cast(AtomLocality::Local), + static_cast(AtomLocality::NonLocal)); + + GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str()); +} + +/*! \brief Convert atom locality to interaction locality. + * + * In the current implementation this is a straightforward conversion: + * local to local, non-local to non-local. + * + * \param[in] atomLocality Atom locality specifier + * \returns Interaction locality corresponding to the atom locality passed. + */ +static inline InteractionLocality gpuAtomToInteractionLocality(const AtomLocality atomLocality) +{ + validateGpuAtomLocality(atomLocality); + + /* determine interaction locality from atom locality */ + if (atomLocality == AtomLocality::Local) + { + return InteractionLocality::Local; + } + else if (atomLocality == AtomLocality::NonLocal) + { + return InteractionLocality::NonLocal; + } + else + { + gmx_incons("Wrong locality"); + } +} + +/*! \brief Returns true if there is GPU short-range work for the given interaction locality. + * + * Note that, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal, + * and therefore if there are GPU offloaded bonded interactions, this function will return + * true for all interaction localities. + * + * \param[in] nb Reference to the nonbonded GPU data structure + * \param[in] iLocality Interaction locality identifier + */ +static inline bool haveGpuShortRangeWork(const NbnxmGpu& nb, const gmx::InteractionLocality iLocality) +{ + return nb.haveWork[iLocality]; +} + +/*! 
\brief Calculate atom range and return start index and length. + * + * \param[in] atomData Atom descriptor data structure + * \param[in] atomLocality Atom locality specifier + * \returns Range of indexes for selected locality. + */ +static inline gmx::Range getGpuAtomRange(const NBAtomData* atomData, const AtomLocality atomLocality) +{ + assert(atomData); + validateGpuAtomLocality(atomLocality); + + /* calculate the atom data index range based on locality */ + if (atomLocality == AtomLocality::Local) + { + return gmx::Range(0, atomData->numAtomsLocal); + } + else + { + return gmx::Range(atomData->numAtomsLocal, atomData->numAtoms); + } +} + } // namespace Nbnxm #endif diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp index 51f7745f7f..3fa7ad9c77 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp +++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp @@ -64,6 +64,7 @@ #include "gromacs/hardware/device_information.h" #include "gromacs/mdtypes/interaction_const.h" +#include "gromacs/nbnxm/gpu_common_utils.h" #include "gromacs/nbnxm/gpu_data_mgmt.h" #include "gromacs/timing/gpu_timing.h" #include "gromacs/utility/cstringutil.h" @@ -410,4 +411,91 @@ enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinat } } +void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality) +{ + GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); + + // There is short-range work if the pair list for the provided + // interaction locality contains entries or if there is any + // bonded work (as this is not split into local/nonlocal). 
+ nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0) + || (gpuBonded != nullptr && gpuBonded->haveInteractions())); +} + +bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality) +{ + GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); + + return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality)); +} + +/*! \brief Asynchronously launch the xq buffer host-to-device copy. */ +void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality) +{ + GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); + + const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); + + NBAtomData* adat = nb->atdat; + gpu_plist* plist = nb->plist[iloc]; + Nbnxm::GpuTimers* timers = nb->timers; + const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; + + const bool bDoTime = nb->bDoTime; + + /* Don't launch the non-local H2D copy if there is no dependent + work to do: neither non-local nor other (e.g. bonded) work + to do that has as input the nbnxn coordinates. + Doing the same for the local kernel is more complicated, since the + local part of the force array also depends on the non-local kernel. + So to avoid complicating the code and to reduce the risk of bugs, + we always call the local x+q copy (and the rest of the local + work in nbnxn_gpu_launch_kernel()). + */ + if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) + { + plist->haveFreshList = false; + + // The event is marked for Local interactions unconditionally, + // so it has to be released here because of the early return + // for NonLocal interactions. 
+ nb->misc_ops_and_local_H2D_done.reset(); + + return; + } + + /* local/nonlocal offset and length used for xq and f */ + const auto atomsRange = getGpuAtomRange(adat, atomLocality); + + /* beginning of timed HtoD section */ + if (bDoTime) + { + timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); + } + + /* HtoD x, q */ + GMX_ASSERT(nbatom->XFormat == nbatXYZQ, + "The coordinates should be in xyzq format to copy to the Float4 device buffer."); + copyToDeviceBuffer(&adat->xq, + reinterpret_cast(nbatom->x().data()) + atomsRange.begin(), + atomsRange.begin(), + atomsRange.size(), + deviceStream, + GpuApiCallBehavior::Async, + nullptr); + + if (bDoTime) + { + timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); + } + + /* When we get here all misc operations issued in the local stream as well as + the local xq H2D are done, + so we record that in the local stream and wait for it in the nonlocal one. + This wait needs to precede any PP tasks, bonded or nonbonded, that may + compute on interactions between local and nonlocal atoms. + */ + nbnxnInsertNonlocalGpuDependency(nb, iloc); +} + } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index b8107fc857..8f3e58fdd6 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -520,76 +520,6 @@ void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality in } } -/*! \brief Launch asynchronously the xq buffer host to device copy. 
*/ -void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - - NBAtomData* adat = nb->atdat; - gpu_plist* plist = nb->plist[iloc]; - Nbnxm::GpuTimers* timers = nb->timers; - const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; - - bool bDoTime = nb->bDoTime; - - /* Don't launch the non-local H2D copy if there is no dependent - work to do: neither non-local nor other (e.g. bonded) work - to do that has as input the nbnxn coordinates. - Doing the same for the local kernel is more complicated, since the - local part of the force array also depends on the non-local kernel. - So to avoid complicating the code and to reduce the risk of bugs, - we always call the local local x+q copy (and the rest of the local - work in nbnxn_gpu_launch_kernel(). - */ - if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) - { - plist->haveFreshList = false; - - // The event is marked for Local interactions unconditionally, - // so it has to be released here because of the early return - // for NonLocal interactions. - nb->misc_ops_and_local_H2D_done.reset(); - - return; - } - - /* local/nonlocal offset and length used for xq and f */ - auto atomsRange = getGpuAtomRange(adat, atomLocality); - - /* beginning of timed HtoD section */ - if (bDoTime) - { - timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); - } - - /* HtoD x, q */ - static_assert(sizeof(float) == sizeof(*nbatom->x().data()), - "The size of the xyzq buffer element should be equal to the size of float4."); - copyToDeviceBuffer(&adat->xq, - reinterpret_cast(nbatom->x().data()) + atomsRange.begin(), - atomsRange.begin(), - atomsRange.size(), - deviceStream, - GpuApiCallBehavior::Async, - bDoTime ? 
timers->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr); - - if (bDoTime) - { - timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); - } - - /* When we get here all misc operations issued in the local stream as well as - the local xq H2D are done, - so we record that in the local stream and wait for it in the nonlocal one. - This wait needs to precede any PP tasks, bonded or nonbonded, that may - compute on interactions between local and nonlocal atoms. - */ - nbnxnInsertNonlocalGpuDependency(nb, iloc); -} - - /*! \brief Launch GPU kernel As we execute nonbonded workload in separate queues, before launching diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp index 7428869888..1f57c36bf6 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp @@ -160,63 +160,6 @@ void gpu_launch_cpyback(NbnxmGpu* nb, } } -/*! \brief Launch asynchronously the xq buffer host to device copy. */ -void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - validateGpuAtomLocality(atomLocality); - - const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - - NBAtomData* adat = nb->atdat; - gpu_plist* plist = nb->plist[iloc]; - const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; - - /* Don't launch the non-local H2D copy if there is no dependent - work to do: neither non-local nor other (e.g. bonded) work - to do that has as input the nbnxn coordinates. - Doing the same for the local kernel is more complicated, since the - local part of the force array also depends on the non-local kernel. - So to avoid complicating the code and to reduce the risk of bugs, - we always call the local local x+q copy (and the rest of the local - work in nbnxn_gpu_launch_kernel(). 
- */ - if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) - { - plist->haveFreshList = false; - - // The event is marked for Local interactions unconditionally, - // so it has to be released here because of the early return - // for NonLocal interactions. - nb->misc_ops_and_local_H2D_done.reset(); - - return; - } - - /* local/nonlocal offset and length used for xq and f */ - auto atomsRange = getGpuAtomRange(adat, atomLocality); - - /* HtoD x, q */ - GMX_ASSERT(adat->xq.elementSize() == sizeof(Float4), - "The size of the xyzq buffer element should be equal to the size of float4."); - copyToDeviceBuffer(&adat->xq, - reinterpret_cast(nbatom->x().data()) + atomsRange.begin(), - atomsRange.begin(), - atomsRange.size(), - deviceStream, - GpuApiCallBehavior::Async, - nullptr); - - /* No need to enforce stream synchronization with events like we do in CUDA/OpenCL. - * Runtime should do the scheduling correctly based on data dependencies. - * But for consistency's sake, we do it anyway. */ - /* When we get here all misc operations issued in the local stream as well as - * the local xq H2D are done, so we record that in the local stream and wait for it in the - * nonlocal one. This wait needs to precede any PP tasks, bonded or nonbonded, that may - * compute on interactions between local and nonlocal atoms. */ - nbnxnInsertNonlocalGpuDependency(nb, iloc); -} - void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { gpu_plist* plist = nb->plist[iloc]; -- 2.22.0