}
}
-/*! \brief Launch asynchronously the xq buffer host to device copy. */
-void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-
-
- NBAtomData* adat = nb->atdat;
- gpu_plist* plist = nb->plist[iloc];
- Nbnxm::GpuTimers* timers = nb->timers;
- const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
- bool bDoTime = nb->bDoTime;
-
- /* Don't launch the non-local H2D copy if there is no dependent
- work to do: neither non-local nor other (e.g. bonded) work
- to do that has as input the nbnxn coordaintes.
- Doing the same for the local kernel is more complicated, since the
- local part of the force array also depends on the non-local kernel.
- So to avoid complicating the code and to reduce the risk of bugs,
- we always call the local local x+q copy (and the rest of the local
- work in nbnxn_gpu_launch_kernel().
- */
- if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
- {
- plist->haveFreshList = false;
-
- // The event is marked for Local interactions unconditionally,
- // so it has to be released here because of the early return
- // for NonLocal interactions.
- nb->misc_ops_and_local_H2D_done.reset();
-
- return;
- }
-
- /* local/nonlocal offset and length used for xq and f */
- auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
- /* beginning of timed HtoD section */
- if (bDoTime)
- {
- timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
- }
-
- /* HtoD x, q */
- static_assert(sizeof(adat->xq[0]) == sizeof(Float4),
- "The size of the xyzq buffer element should be equal to the size of float4.");
- copyToDeviceBuffer(&adat->xq,
- reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
- atomsRange.begin(),
- atomsRange.size(),
- deviceStream,
- GpuApiCallBehavior::Async,
- nullptr);
-
- if (bDoTime)
- {
- timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
- }
-
- /* When we get here all misc operations issued in the local stream as well as
- the local xq H2D are done,
- so we record that in the local stream and wait for it in the nonlocal one.
- This wait needs to precede any PP tasks, bonded or nonbonded, that may
- compute on interactions between local and nonlocal atoms.
- */
- nbnxnInsertNonlocalGpuDependency(nb, iloc);
-}
-
/*! As we execute nonbonded workload in separate streams, before launching
the kernel we need to make sure that he following operations have completed:
- atomdata allocation and related H2D transfers (every nstlist step);
#endif
#include "gromacs/gpu_utils/gpu_utils.h"
-#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
-#include "gromacs/utility/fatalerror.h"
-#include "gromacs/utility/range.h"
#include "gromacs/utility/stringutil.h"
#include "gpu_common_utils.h"
namespace Nbnxm
{
-/*! \brief Check that atom locality values are valid for the GPU module.
- *
- * In the GPU module atom locality "all" is not supported, the local and
- * non-local ranges are treated separately.
- *
- * \param[in] atomLocality atom locality specifier
- */
-static inline void validateGpuAtomLocality(const AtomLocality atomLocality)
-{
- std::string str = gmx::formatString(
- "Invalid atom locality passed (%d); valid here is only "
- "local (%d) or nonlocal (%d)",
- static_cast<int>(atomLocality),
- static_cast<int>(AtomLocality::Local),
- static_cast<int>(AtomLocality::NonLocal));
-
- GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str());
-}
-
-/*! \brief Convert atom locality to interaction locality.
- *
- * In the current implementation the this is straightforward conversion:
- * local to local, non-local to non-local.
- *
- * \param[in] atomLocality Atom locality specifier
- * \returns Interaction locality corresponding to the atom locality passed.
- */
-static inline InteractionLocality gpuAtomToInteractionLocality(const AtomLocality atomLocality)
-{
- validateGpuAtomLocality(atomLocality);
-
- /* determine interaction locality from atom locality */
- if (atomLocality == AtomLocality::Local)
- {
- return InteractionLocality::Local;
- }
- else if (atomLocality == AtomLocality::NonLocal)
- {
- return InteractionLocality::NonLocal;
- }
- else
- {
- gmx_incons("Wrong locality");
- }
-}
-
-
-//NOLINTNEXTLINE(misc-definitions-in-headers)
-void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- // There is short-range work if the pair list for the provided
- // interaction locality contains entries or if there is any
- // bonded work (as this is not split into local/nonlocal).
- nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
- || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
-}
-
-/*! \brief Returns true if there is GPU short-range work for the given interaction locality.
- *
- * Note that as, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal,
- * and therefore if there are GPU offloaded bonded interactions, this function will return
- * true for all interaction localities.
- *
- * \param[inout] nb Pointer to the nonbonded GPU data structure
- * \param[in] iLocality Interaction locality identifier
- */
-static bool haveGpuShortRangeWork(const NbnxmGpu& nb, const gmx::InteractionLocality iLocality)
-{
- return nb.haveWork[iLocality];
-}
-
-//NOLINTNEXTLINE(misc-definitions-in-headers)
-bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
-}
-
-
-/*! \brief Calculate atom range and return start index and length.
- *
- * \param[in] atomData Atom descriptor data structure
- * \param[in] atomLocality Atom locality specifier
- * \returns Range of indexes for selected locality.
- */
-static inline gmx::Range<int> getGpuAtomRange(const NBAtomData* atomData, const AtomLocality atomLocality)
-{
- assert(atomData);
- validateGpuAtomLocality(atomLocality);
-
- /* calculate the atom data index range based on locality */
- if (atomLocality == AtomLocality::Local)
- {
- return gmx::Range<int>(0, atomData->numAtomsLocal);
- }
- else
- {
- return gmx::Range<int>(atomData->numAtomsLocal, atomData->numAtoms);
- }
-}
-
-
/*! \brief Count pruning kernel time if either kernel has been triggered
*
* We do the accounting for either of the two pruning kernel flavors:
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2017,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "config.h"
-#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/range.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
#if GMX_GPU_CUDA
# include "cuda/nbnxm_cuda_types.h"
return (iloc == InteractionLocality::NonLocal && nb.plist[iloc]->nsci == 0);
}
+/*! \brief Check that atom locality values are valid for the GPU module.
+ *
+ * In the GPU module atom locality "all" is not supported, the local and
+ * non-local ranges are treated separately.
+ *
+ * The diagnostic text is formatted inside the assertion's message argument,
+ * so the string is only built when the check fails instead of on every call.
+ *
+ * \param[in] atomLocality atom locality specifier
+ */
+static inline void validateGpuAtomLocality(const AtomLocality atomLocality)
+{
+    // Keeps the parameter "used" when assertions are compiled out.
+    static_cast<void>(atomLocality);
+
+    GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal,
+               gmx::formatString("Invalid atom locality passed (%d); valid here is only "
+                                 "local (%d) or nonlocal (%d)",
+                                 static_cast<int>(atomLocality),
+                                 static_cast<int>(AtomLocality::Local),
+                                 static_cast<int>(AtomLocality::NonLocal))
+                       .c_str());
+}
+
+/*! \brief Map an atom locality onto the matching interaction locality.
+ *
+ * The mapping is one-to-one: local atoms map to local interactions and
+ * non-local atoms map to non-local interactions. The input is first passed
+ * through validateGpuAtomLocality(); any value that is neither local nor
+ * non-local ends in gmx_incons().
+ *
+ * \param[in] atomLocality Atom locality specifier
+ * \returns                Interaction locality corresponding to the atom locality passed.
+ */
+static inline InteractionLocality gpuAtomToInteractionLocality(const AtomLocality atomLocality)
+{
+    validateGpuAtomLocality(atomLocality);
+
+    if (atomLocality == AtomLocality::Local)
+    {
+        return InteractionLocality::Local;
+    }
+    if (atomLocality == AtomLocality::NonLocal)
+    {
+        return InteractionLocality::NonLocal;
+    }
+
+    /* Neither local nor non-local: report the inconsistency */
+    gmx_incons("Wrong locality");
+}
+
+/*! \brief Query the stored short-range work flag for an interaction locality.
+ *
+ * This only reads the \c haveWork entry of \p nb for \p iLocality.
+ * Note that, unlike nonbonded tasks, bonded tasks are not split into
+ * local/nonlocal, and therefore if there are GPU offloaded bonded
+ * interactions the flag is true for all interaction localities.
+ *
+ * \param[in] nb        Nonbonded GPU data structure
+ * \param[in] iLocality Interaction locality identifier
+ * \returns             The work flag stored for \p iLocality.
+ */
+static inline bool haveGpuShortRangeWork(const NbnxmGpu& nb, const gmx::InteractionLocality iLocality)
+{
+    const bool haveWorkForLocality = nb.haveWork[iLocality];
+    return haveWorkForLocality;
+}
+
+/*! \brief Calculate the atom index range that belongs to an atom locality.
+ *
+ * For the local locality the range is built from 0 and \c numAtomsLocal;
+ * otherwise it is built from \c numAtomsLocal and \c numAtoms.
+ * The locality is checked with validateGpuAtomLocality() first.
+ *
+ * \param[in] atomData     Atom descriptor data structure, must not be null
+ * \param[in] atomLocality Atom locality specifier
+ * \returns                Range of indexes for selected locality.
+ */
+static inline gmx::Range<int> getGpuAtomRange(const NBAtomData* atomData, const AtomLocality atomLocality)
+{
+    /* Same assertion facility as the rest of the GPU helpers, with a message */
+    GMX_ASSERT(atomData, "Need a valid atom data object");
+    validateGpuAtomLocality(atomLocality);
+
+    /* calculate the atom data index range based on locality */
+    if (atomLocality == AtomLocality::Local)
+    {
+        return gmx::Range<int>(0, atomData->numAtomsLocal);
+    }
+    return gmx::Range<int>(atomData->numAtomsLocal, atomData->numAtoms);
+}
+
} // namespace Nbnxm
#endif
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
}
}
+// Stores in nb->haveWork whether there is short-range GPU work for iLocality.
+void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    // A pair list with entries for this interaction locality means nonbonded work.
+    const bool havePairListEntries = (nb->plist[iLocality]->nsci != 0);
+
+    // Any bonded work also counts, as it is not split into local/nonlocal.
+    // The bonded module is only queried when the pair list is empty.
+    nb->haveWork[iLocality] =
+            havePairListEntries || (gpuBonded != nullptr && gpuBonded->haveInteractions());
+}
+
+// Atom-locality overload: converts to the interaction locality and reads the work flag.
+bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(aLocality);
+    return haveGpuShortRangeWork(*nb, iLocality);
+}
+
+/*! \brief Launch asynchronously the xq buffer host to device copy.
+ *
+ * Copies the entries of \p nbatom that belong to \p atomLocality into the
+ * device buffer \c nb->atdat->xq, using the device stream of the matching
+ * interaction locality. When \c nb->bDoTime is set, the copy is enclosed in
+ * the \c nb_h2d timing region of that atom locality.
+ *
+ * For the non-local locality the copy is skipped when no short-range work
+ * is flagged; in that case the pair list's \c haveFreshList flag is cleared
+ * and the \c misc_ops_and_local_H2D_done event is reset before returning.
+ *
+ * \param[in,out] nb            Nonbonded GPU data structure, must not be null
+ * \param[in]     nbatom        Host atom data, coordinates must be in xyzq format
+ * \param[in]     atomLocality  Atom locality whose range is copied
+ */
+void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+ const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+
+ NBAtomData* adat = nb->atdat;
+ gpu_plist* plist = nb->plist[iloc];
+ Nbnxm::GpuTimers* timers = nb->timers;
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+
+ const bool bDoTime = nb->bDoTime;
+
+ /* Don't launch the non-local H2D copy if there is no dependent
+ work to do: neither non-local nor other (e.g. bonded) work
+ to do that has as input the nbnxn coordinates.
+ Doing the same for the local kernel is more complicated, since the
+ local part of the force array also depends on the non-local kernel.
+ So to avoid complicating the code and to reduce the risk of bugs,
+ we always call the local x+q copy (and the rest of the local
+ work in nbnxn_gpu_launch_kernel()).
+ */
+ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+ {
+ plist->haveFreshList = false;
+
+ // The event is marked for Local interactions unconditionally,
+ // so it has to be released here because of the early return
+ // for NonLocal interactions.
+ nb->misc_ops_and_local_H2D_done.reset();
+
+ return;
+ }
+
+ /* local/nonlocal offset and length used for xq and f */
+ const auto atomsRange = getGpuAtomRange(adat, atomLocality);
+
+ /* beginning of timed HtoD section */
+ if (bDoTime)
+ {
+ timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
+ }
+
+ /* HtoD x, q: the host coordinates are reinterpreted as Float4 elements,
+ which is why the xyzq format is asserted first */
+ GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
+ "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
+ copyToDeviceBuffer(&adat->xq,
+ reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
+ atomsRange.begin(),
+ atomsRange.size(),
+ deviceStream,
+ GpuApiCallBehavior::Async,
+ nullptr);
+
+ /* end of timed HtoD section */
+ if (bDoTime)
+ {
+ timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
+ }
+
+ /* When we get here all misc operations issued in the local stream as well as
+ the local xq H2D are done,
+ so we record that in the local stream and wait for it in the nonlocal one.
+ This wait needs to precede any PP tasks, bonded or nonbonded, that may
+ compute on interactions between local and nonlocal atoms.
+ */
+ nbnxnInsertNonlocalGpuDependency(nb, iloc);
+}
+
} // namespace Nbnxm
}
}
-/*! \brief Launch asynchronously the xq buffer host to device copy. */
-void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-
- NBAtomData* adat = nb->atdat;
- gpu_plist* plist = nb->plist[iloc];
- Nbnxm::GpuTimers* timers = nb->timers;
- const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
- bool bDoTime = nb->bDoTime;
-
- /* Don't launch the non-local H2D copy if there is no dependent
- work to do: neither non-local nor other (e.g. bonded) work
- to do that has as input the nbnxn coordinates.
- Doing the same for the local kernel is more complicated, since the
- local part of the force array also depends on the non-local kernel.
- So to avoid complicating the code and to reduce the risk of bugs,
- we always call the local local x+q copy (and the rest of the local
- work in nbnxn_gpu_launch_kernel().
- */
- if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
- {
- plist->haveFreshList = false;
-
- // The event is marked for Local interactions unconditionally,
- // so it has to be released here because of the early return
- // for NonLocal interactions.
- nb->misc_ops_and_local_H2D_done.reset();
-
- return;
- }
-
- /* local/nonlocal offset and length used for xq and f */
- auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
- /* beginning of timed HtoD section */
- if (bDoTime)
- {
- timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
- }
-
- /* HtoD x, q */
- static_assert(sizeof(float) == sizeof(*nbatom->x().data()),
- "The size of the xyzq buffer element should be equal to the size of float4.");
- copyToDeviceBuffer(&adat->xq,
- reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
- atomsRange.begin(),
- atomsRange.size(),
- deviceStream,
- GpuApiCallBehavior::Async,
- bDoTime ? timers->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
-
- if (bDoTime)
- {
- timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
- }
-
- /* When we get here all misc operations issued in the local stream as well as
- the local xq H2D are done,
- so we record that in the local stream and wait for it in the nonlocal one.
- This wait needs to precede any PP tasks, bonded or nonbonded, that may
- compute on interactions between local and nonlocal atoms.
- */
- nbnxnInsertNonlocalGpuDependency(nb, iloc);
-}
-
-
/*! \brief Launch GPU kernel
As we execute nonbonded workload in separate queues, before launching
}
}
-/*! \brief Launch asynchronously the xq buffer host to device copy. */
-void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
- validateGpuAtomLocality(atomLocality);
-
- const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-
- NBAtomData* adat = nb->atdat;
- gpu_plist* plist = nb->plist[iloc];
- const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
- /* Don't launch the non-local H2D copy if there is no dependent
- work to do: neither non-local nor other (e.g. bonded) work
- to do that has as input the nbnxn coordinates.
- Doing the same for the local kernel is more complicated, since the
- local part of the force array also depends on the non-local kernel.
- So to avoid complicating the code and to reduce the risk of bugs,
- we always call the local local x+q copy (and the rest of the local
- work in nbnxn_gpu_launch_kernel().
- */
- if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
- {
- plist->haveFreshList = false;
-
- // The event is marked for Local interactions unconditionally,
- // so it has to be released here because of the early return
- // for NonLocal interactions.
- nb->misc_ops_and_local_H2D_done.reset();
-
- return;
- }
-
- /* local/nonlocal offset and length used for xq and f */
- auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
- /* HtoD x, q */
- GMX_ASSERT(adat->xq.elementSize() == sizeof(Float4),
- "The size of the xyzq buffer element should be equal to the size of float4.");
- copyToDeviceBuffer(&adat->xq,
- reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
- atomsRange.begin(),
- atomsRange.size(),
- deviceStream,
- GpuApiCallBehavior::Async,
- nullptr);
-
- /* No need to enforce stream synchronization with events like we do in CUDA/OpenCL.
- * Runtime should do the scheduling correctly based on data dependencies.
- * But for consistency's sake, we do it anyway. */
- /* When we get here all misc operations issued in the local stream as well as
- * the local xq H2D are done, so we record that in the local stream and wait for it in the
- * nonlocal one. This wait needs to precede any PP tasks, bonded or nonbonded, that may
- * compute on interactions between local and nonlocal atoms. */
- nbnxnInsertNonlocalGpuDependency(nb, iloc);
-}
-
void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
gpu_plist* plist = nb->plist[iloc];