return reinterpret_cast<DeviceBuffer<gmx::RVec>>(nb->atdat->fShift);
}
-/* Initialization for X buffer operations on GPU. */
-/* TODO Remove explicit pinning from host arrays from here and manage in a more natural way*/
-void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
-{
- const DeviceStream& localStream = *gpu_nbv->deviceStreams[InteractionLocality::Local];
- bool bDoTime = gpu_nbv->bDoTime;
- const int maxNumColumns = gridSet.numColumnsMax();
-
- reallocateDeviceBuffer(&gpu_nbv->cxy_na,
- maxNumColumns * gridSet.grids().size(),
- &gpu_nbv->ncxy_na,
- &gpu_nbv->ncxy_na_alloc,
- *gpu_nbv->deviceContext_);
- reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
- maxNumColumns * gridSet.grids().size(),
- &gpu_nbv->ncxy_ind,
- &gpu_nbv->ncxy_ind_alloc,
- *gpu_nbv->deviceContext_);
-
- for (unsigned int g = 0; g < gridSet.grids().size(); g++)
- {
-
- const Nbnxm::Grid& grid = gridSet.grids()[g];
-
- const int numColumns = grid.numColumns();
- const int* atomIndices = gridSet.atomIndices().data();
- const int atomIndicesSize = gridSet.atomIndices().size();
- const int* cxy_na = grid.cxy_na().data();
- const int* cxy_ind = grid.cxy_ind().data();
-
- reallocateDeviceBuffer(&gpu_nbv->atomIndices,
- atomIndicesSize,
- &gpu_nbv->atomIndicesSize,
- &gpu_nbv->atomIndicesSize_alloc,
- *gpu_nbv->deviceContext_);
-
- if (atomIndicesSize > 0)
- {
-
- if (bDoTime)
- {
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream);
- }
-
- copyToDeviceBuffer(&gpu_nbv->atomIndices,
- atomIndices,
- 0,
- atomIndicesSize,
- localStream,
- GpuApiCallBehavior::Async,
- nullptr);
-
- if (bDoTime)
- {
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream);
- }
- }
-
- if (numColumns > 0)
- {
- if (bDoTime)
- {
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream);
- }
-
- int* destPtr = &gpu_nbv->cxy_na[maxNumColumns * g];
- copyToDeviceBuffer(
- &destPtr, cxy_na, 0, numColumns, localStream, GpuApiCallBehavior::Async, nullptr);
-
- if (bDoTime)
- {
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream);
- }
-
- if (bDoTime)
- {
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream);
- }
-
- destPtr = &gpu_nbv->cxy_ind[maxNumColumns * g];
- copyToDeviceBuffer(
- &destPtr, cxy_ind, 0, numColumns, localStream, GpuApiCallBehavior::Async, nullptr);
-
- if (bDoTime)
- {
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream);
- }
- }
- }
-
- if (gpu_nbv->bUseTwoStreams)
- {
- // The above data is transferred on the local stream but is a
- // dependency of the nonlocal stream (specifically the nonlocal X
- // buf ops kernel). We therefore set a dependency to ensure
- // that the nonlocal stream waits on the local stream here.
- // This call records an event in the local stream:
- gpu_nbv->misc_ops_and_local_H2D_done.markEvent(
- *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::Local]);
- // ...and this call instructs the nonlocal stream to wait on that event:
- gpu_nbv->misc_ops_and_local_H2D_done.enqueueWaitEvent(
- *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::NonLocal]);
- }
-
- return;
-}
-
} // namespace Nbnxm
/*! \brief Initialization for X buffer operations on GPU.
* Called on the neighbor-search (NS) step and performs (re-)allocations and memory copies. */
-CUDA_FUNC_QUALIFIER
+GPU_FUNC_QUALIFIER
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet,
- NbnxmGpu gmx_unused* gpu_nbv) CUDA_FUNC_TERM;
+ NbnxmGpu gmx_unused* gpu_nbv) GPU_FUNC_TERM;
/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
*
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/gridset.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/pbcutil/ishift.h"
nbnxnInsertNonlocalGpuDependency(nb, iloc);
}
+
+/* Initialization for X buffer operations on GPU. */
+void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
+{
+ const DeviceStream& localStream = *gpu_nbv->deviceStreams[InteractionLocality::Local];
+ const bool bDoTime = gpu_nbv->bDoTime;
+ const int maxNumColumns = gridSet.numColumnsMax();
+
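+ // cxy_na and cxy_ind hold one slice of maxNumColumns entries per grid, so the
+ // buffers are sized maxNumColumns * numGrids and each grid's data is copied
+ // below at offset maxNumColumns * g.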
+ reallocateDeviceBuffer(&gpu_nbv->cxy_na,
+ maxNumColumns * gridSet.grids().size(),
+ &gpu_nbv->ncxy_na,
+ &gpu_nbv->ncxy_na_alloc,
+ *gpu_nbv->deviceContext_);
+ reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
+ maxNumColumns * gridSet.grids().size(),
+ &gpu_nbv->ncxy_ind,
+ &gpu_nbv->ncxy_ind_alloc,
+ *gpu_nbv->deviceContext_);
+
+ for (unsigned int g = 0; g < gridSet.grids().size(); g++)
+ {
+ const Nbnxm::Grid& grid = gridSet.grids()[g];
+
+ const int numColumns = grid.numColumns();
+ const int* atomIndices = gridSet.atomIndices().data();
+ const int atomIndicesSize = gridSet.atomIndices().size();
+ const int* cxy_na = grid.cxy_na().data();
+ const int* cxy_ind = grid.cxy_ind().data();
+
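+ // H2D transfer timer for the local X buffer ops; only dereferenced when bDoTime is true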
+ auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;
+
+ reallocateDeviceBuffer(&gpu_nbv->atomIndices,
+ atomIndicesSize,
+ &gpu_nbv->atomIndicesSize,
+ &gpu_nbv->atomIndicesSize_alloc,
+ *gpu_nbv->deviceContext_);
+
+ if (atomIndicesSize > 0)
+ {
+ if (bDoTime)
+ {
+ timerH2D->openTimingRegion(localStream);
+ }
+
+ copyToDeviceBuffer(&gpu_nbv->atomIndices,
+ atomIndices,
+ 0,
+ atomIndicesSize,
+ localStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timerH2D->fetchNextEvent() : nullptr);
+
+ if (bDoTime)
+ {
+ timerH2D->closeTimingRegion(localStream);
+ }
+ }
+
+ if (numColumns > 0)
+ {
+ if (bDoTime)
+ {
+ timerH2D->openTimingRegion(localStream);
+ }
+
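+ // copy this grid's per-column atom counts into its slice of the device
+ // buffer, addressed by the explicit offset maxNumColumns * g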
+ copyToDeviceBuffer(&gpu_nbv->cxy_na,
+ cxy_na,
+ maxNumColumns * g,
+ numColumns,
+ localStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timerH2D->fetchNextEvent() : nullptr);
+
+ if (bDoTime)
+ {
+ timerH2D->closeTimingRegion(localStream);
+ }
+
+ if (bDoTime)
+ {
+ timerH2D->openTimingRegion(localStream);
+ }
+
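+ // likewise for this grid's column-to-cell index mapping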
+ copyToDeviceBuffer(&gpu_nbv->cxy_ind,
+ cxy_ind,
+ maxNumColumns * g,
+ numColumns,
+ localStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timerH2D->fetchNextEvent() : nullptr);
+
+ if (bDoTime)
+ {
+ timerH2D->closeTimingRegion(localStream);
+ }
+ }
+ }
+
+ // The above data is transferred on the local stream but is a
+ // dependency of the nonlocal stream (specifically the nonlocal X
+ // buf ops kernel). We therefore set a dependency to ensure that the
+ // nonlocal stream waits on the local stream here; both calls below are
+ // no-ops when only a single stream is in use.
+ // This call records an event in the local stream:
+ nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
+ // ...and this call instructs the nonlocal stream to wait on that event:
+ nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+}
+
} // namespace Nbnxm
//! staging area where fshift/energies get downloaded
NBStagingData nbst;
+ // Data for GPU-side coordinate conversion between integrator and NBNXM
+ /*! \brief array of atom indices */
+ DeviceBuffer<int> atomIndices;
+ /*! \brief size of atom indices */
+ int atomIndicesSize = 0;
+ /*! \brief size of atom indices allocated in device buffer */
+ int atomIndicesSize_alloc = 0;
+ /*! \brief number of atoms per grid column (x buf ops) */
+ DeviceBuffer<int> cxy_na;
+ /*! \brief number of elements in cxy_na */
+ int ncxy_na = 0;
+ /*! \brief number of elements allocated in device buffer */
+ int ncxy_na_alloc = 0;
+ /*! \brief grid column to cell index mapping (x buf ops) */
+ DeviceBuffer<int> cxy_ind;
+ /*! \brief number of elements in cxy_ind */
+ int ncxy_ind = 0;
+ /*! \brief number of elements allocated in device buffer */
+ int ncxy_ind_alloc = 0;
+
//! local and non-local GPU queues
gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
/*! \brief atom data */
NBAtomData* atdat = nullptr;
+ // Data for GPU-side coordinate conversion between integrator and NBNXM
+ /*! \brief array of atom indices */
+ DeviceBuffer<int> atomIndices;
+ /*! \brief size of atom indices */
+ int atomIndicesSize = 0;
+ /*! \brief size of atom indices allocated in device buffer */
+ int atomIndicesSize_alloc = 0;
+ /*! \brief number of atoms per grid column (x buf ops) */
+ DeviceBuffer<int> cxy_na;
+ /*! \brief number of elements in cxy_na */
+ int ncxy_na = 0;
+ /*! \brief number of elements allocated in device buffer */
+ int ncxy_na_alloc = 0;
+ /*! \brief grid column to cell index mapping (x buf ops) */
+ DeviceBuffer<int> cxy_ind;
+ /*! \brief number of elements in cxy_ind */
+ int ncxy_ind = 0;
+ /*! \brief number of elements allocated in device buffer */
+ int ncxy_ind_alloc = 0;
+
NBParamGpu* nbparam = nullptr;
/*! \brief pair-list data structures (local and non-local) */
gmx::EnumerationArray<Nbnxm::InteractionLocality, Nbnxm::gpu_plist*> plist = { { nullptr } };