Fixes a bug that occurs when there are more than two grids (i.e. when
grids do not map naively onto local/nonlocal), by using a separate GPU
memory space for the cxy_na and cxy_ind data of each grid (previously
there was only one local and one nonlocal space, so extra grids
overwrote each other's data). Also simplifies the setup: the init
function is now called once per NS step to set up data for all grids,
instead of once each for local and nonlocal.
Change-Id: Ia2b97d22324aa97dca34b05da2eca2e2090372af
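
For orientation, the sketch below shows the flat layout this change adopts (illustrative only; the struct and helpers are hypothetical, not GROMACS API): one buffer sized numColumnsMax*numGrids, with grid g's column data starting at offset numColumnsMax*g, so a third or later grid can no longer overwrite another grid's data.

```cpp
#include <algorithm>
#include <vector>

// Hypothetical host-side mirror of the device-buffer layout used below,
// where grid g's slice starts at numColumnsMax*g (cf. the kernel argument
// setup &nb->cxy_na[numColumnsMax*gridId] in this patch).
struct FlatGridColumns
{
    int              numColumnsMax = 0;
    std::vector<int> cxy_na;   // per-column atom counts, all grids
    std::vector<int> cxy_ind;  // per-column cell indices, all grids

    void resize(const std::vector<int> &columnsPerGrid)
    {
        numColumnsMax = 0;
        for (const int n : columnsPerGrid)
        {
            numColumnsMax = std::max(numColumnsMax, n);
        }
        // One slice per grid, instead of one local and one nonlocal slot
        // that a third or later grid would overwrite.
        cxy_na.resize(numColumnsMax*columnsPerGrid.size());
        cxy_ind.resize(numColumnsMax*columnsPerGrid.size());
    }

    // Start of grid g's slice, mirroring &cxy_na[numColumnsMax*g]:
    int *naSlice(int g)  { return cxy_na.data()  + numColumnsMax*g; }
    int *indSlice(int g) { return cxy_ind.data() + numColumnsMax*g; }
};
```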
- nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::Local);
+ nbv->atomdata_init_copy_x_to_nbat_x_gpu();
&top->excls, step, nrnb);
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
wallcycle_stop(wcycle, ewcNS);
-
- if (useGpuXBufOps)
- {
-
- nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::NonLocal);
- }
stateInstance = std::make_unique<t_state>();
state = stateInstance.get();
- if (fr->nbv->useGpu())
- {
- changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
- }
dd_init_local_state(cr->dd, state_global, state);
/* Distribute the charge groups over the nodes from the master node */
+ if (fr->nbv->useGpu())
+ {
+ changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
+ }
+
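
The pinning matters because the GPU path copies state->x to the device asynchronously; a minimal sketch of why, using raw CUDA runtime calls (the function below is illustrative, not part of this patch):

```cpp
#include <cuda_runtime.h>
#include <vector>

// cudaMemcpyAsync from pageable host memory degrades to a staged,
// effectively synchronous transfer; page-locking (pinning) the source
// keeps the copy truly asynchronous on the stream.
static void asyncCopySketch(std::vector<float> &hostX, float *devX, cudaStream_t stream)
{
    // Pin an existing allocation in place, as cudaHostRegister does for
    // atomIndices/cxy_na/cxy_ind later in this patch:
    cudaHostRegister(hostX.data(), hostX.size()*sizeof(float), cudaHostRegisterDefault);
    cudaMemcpyAsync(devX, hostX.data(), hostX.size()*sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream); // transfer must finish before unpinning
    cudaHostUnregister(hostX.data());
}
```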
// NOTE: The global state is no longer used at this point.
// But state_global is still used as temporary storage space for writing
// the global state to file and potentially for replica exchange.
gpu_nbv,
xPmeDevicePtr,
locality,
+ x, g, gridSet.numColumnsMax());
*
* As the point where the local stream tasks can be considered complete happens
* at the same call point where the nonlocal stream should be synced with the
- * the local, this function recrds the event if called with the local stream as
+ * the local, this function records the event if called with the local stream as
* argument and inserts in the GPU stream a wait on the event on the nonlocal.
*/
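
A minimal sketch of that record-then-wait pattern in raw CUDA runtime calls (the event and stream parameters are illustrative stand-ins for nb->misc_ops_and_local_H2D_done and nb->stream[...]):

```cpp
#include <cuda_runtime.h>

// Record an event at the dependency point in the local stream, or make the
// nonlocal stream wait on that event; no host-side synchronization occurs.
static void insertDependencySketch(cudaEvent_t  localDone,
                                   cudaStream_t localStream,
                                   cudaStream_t nonlocalStream,
                                   bool         calledOnLocalStream)
{
    if (calledOnLocalStream)
    {
        // Mark the point in the local stream that nonlocal work depends on.
        cudaEventRecord(localDone, localStream);
    }
    else
    {
        // Enqueue an on-device wait: the nonlocal stream will not proceed
        // past this point until the recorded event has completed.
        cudaStreamWaitEvent(nonlocalStream, localDone, 0);
    }
}
```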
-static void insertNonlocalGpuDependency(const gmx_nbnxn_cuda_t *nb,
- const InteractionLocality interactionLocality)
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t *nb,
+ const InteractionLocality interactionLocality)
{
cudaStream_t stream = nb->stream[interactionLocality];
This wait needs to precede any PP tasks, bonded or nonbonded, that may
compute on interactions between local and nonlocal atoms.
*/
- insertNonlocalGpuDependency(nb, iloc);
+ nbnxnInsertNonlocalGpuDependency(nb, iloc);
}
/*! As we execute nonbonded workload in separate streams, before launching
gmx_nbnxn_gpu_t *nb,
void *xPmeDevicePtr,
const Nbnxm::AtomLocality locality,
+ const rvec *x,
+ int gridId,
+ int numColumnsMax)
{
cu_atomdata_t *adat = nb->atdat;
bool bDoTime = nb->bDoTime;
const int numColumns = grid.numColumns();
const int cellOffset = grid.cellOffset();
const int numAtomsPerCell = grid.numAtomsPerCell();
- // TODO: Document this, one can not infer the interaction locality from the atom locality
- Nbnxm::InteractionLocality interactionLoc = Nbnxm::InteractionLocality::Local;
- int nCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
- int copyAtomStart = grid.srcAtomBegin();
+ Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality);
+ int nCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
+ int copyAtomStart = grid.srcAtomBegin();
- if (locality == Nbnxm::AtomLocality::NonLocal)
- {
- interactionLoc = Nbnxm::InteractionLocality::NonLocal;
- }
-
- cudaStream_t stream = nb->stream[interactionLoc];
+ cudaStream_t stream = nb->stream[interactionLoc];
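
The shared helper replaces the hand-rolled locality test removed here; a hedged sketch of the mapping gpuAtomToInteractionLocality() is assumed to perform (the real GROMACS helper may handle invalid localities differently):

```cpp
#include <cassert>

// Only Local and NonLocal atom localities correspond to a GPU stream.
enum class AtomLocality        { Local, NonLocal, All, Count };
enum class InteractionLocality { Local, NonLocal };

static InteractionLocality gpuAtomToInteractionLocality(AtomLocality atomLocality)
{
    assert(atomLocality == AtomLocality::Local ||
           atomLocality == AtomLocality::NonLocal);
    return (atomLocality == AtomLocality::Local) ? InteractionLocality::Local
                                                 : InteractionLocality::NonLocal;
}
```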
// FIXME: need to either let the local stream get to the
// insertNonlocalGpuDependency call or call it separately here
{
if (interactionLoc == Nbnxm::InteractionLocality::Local)
{
- insertNonlocalGpuDependency(nb, interactionLoc);
+ nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
config.sharedMemorySize = 0;
config.stream = stream;
- auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel;
- float *xqPtr = &(adat->xq->x);
- const int *d_atomIndices = nb->atomIndices;
- const int *d_cxy_na = nb->cxy_na[locality];
- const int *d_cxy_ind = nb->cxy_ind[locality];
- const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
- &numColumns,
- &xqPtr,
- &setFillerCoords,
- &d_x,
- &d_atomIndices,
- &d_cxy_na,
- &d_cxy_ind,
- &cellOffset,
- &numAtomsPerCell);
+ auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel;
+ float *xqPtr = &(adat->xq->x);
+ const int *d_atomIndices = nb->atomIndices;
+ const int *d_cxy_na = &nb->cxy_na[numColumnsMax*gridId];
+ const int *d_cxy_ind = &nb->cxy_ind[numColumnsMax*gridId];
+ const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
+ &numColumns,
+ &xqPtr,
+ &setFillerCoords,
+ &d_x,
+ &d_atomIndices,
+ &d_cxy_na,
+ &d_cxy_ind,
+ &cellOffset,
+ &numAtomsPerCell);
launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
- insertNonlocalGpuDependency(nb, interactionLoc);
+ nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
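
For context, a simplified sketch of how the conversion kernel is assumed to consume the per-grid slices passed above (this is not the actual nbnxn_gpu_x_to_nbat_x_kernel: the cell arithmetic is inferred from the host-side arguments, and filler-atom and charge handling are omitted):

```cpp
#include <cuda_runtime.h>

// One block per grid column: read the column's atom count and starting cell
// from the cxy_na/cxy_ind slice selected for this grid, then gather the rvec
// coordinates into the nbat xq layout.
__global__ void xToNbatXSketch(int           numColumns,
                               float4       *xq,           // nbat xq layout
                               const float3 *x,            // rvec coordinates
                               const int    *atomIndices,  // nbat cell -> atom
                               const int    *cxy_na,       // this grid's slice
                               const int    *cxy_ind,      // this grid's slice
                               int           cellOffset,
                               int           numAtomsPerCell)
{
    const int column = blockIdx.x;
    if (column >= numColumns)
    {
        return;
    }
    const int numAtoms  = cxy_na[column];                // real atoms in column
    const int firstCell = cellOffset + cxy_ind[column];  // assumed semantics
    const int firstAtom = firstCell*numAtomsPerCell;
    for (int i = threadIdx.x; i < numAtoms; i += blockDim.x)
    {
        const int    src = atomIndices[firstAtom + i];
        const float3 xi  = x[src];
        xq[firstAtom + i] = make_float4(xi.x, xi.y, xi.z, 0.0f);
    }
}
```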
cuda_init_const(nb, ic, listParams, nbat->params());
- nb->natoms = 0;
- nb->natoms_alloc = 0;
- nb->atomIndicesSize = 0;
- nb->atomIndicesSize_alloc = 0;
- nb->ncxy_na[AtomLocality::Local] = 0;
- nb->ncxy_na[AtomLocality::NonLocal] = 0;
- nb->ncxy_na_alloc[AtomLocality::Local] = 0;
- nb->ncxy_na_alloc[AtomLocality::NonLocal] = 0;
- nb->ncxy_ind[AtomLocality::Local] = 0;
- nb->ncxy_ind[AtomLocality::NonLocal] = 0;
- nb->ncxy_ind_alloc[AtomLocality::Local] = 0;
- nb->ncxy_ind_alloc[AtomLocality::NonLocal] = 0;
+ nb->natoms = 0;
+ nb->natoms_alloc = 0;
+ nb->atomIndicesSize = 0;
+ nb->atomIndicesSize_alloc = 0;
+ nb->ncxy_na = 0;
+ nb->ncxy_na_alloc = 0;
+ nb->ncxy_ind = 0;
+ nb->ncxy_ind_alloc = 0;
/* Initialization for X buffer operations on GPU. */
/* TODO Remove explicit pinning from host arrays from here and manage in a more natural way*/
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
- gmx_nbnxn_gpu_t *gpu_nbv,
- const Nbnxm::AtomLocality locality)
+ gmx_nbnxn_gpu_t *gpu_nbv)
- const Nbnxm::InteractionLocality iloc = ((locality == AtomLocality::Local) ?
- InteractionLocality::Local : InteractionLocality::NonLocal);
- cudaStream_t stream = gpu_nbv->stream[iloc];
+ cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
bool bDoTime = gpu_nbv->bDoTime;
- int gridBegin = 0, gridEnd = 0;
+ const int maxNumColumns = gridSet.numColumnsMax();
- switch (locality)
- {
- case Nbnxm::AtomLocality::All:
- gridBegin = 0;
- gridEnd = gridSet.grids().size();
- break;
- case Nbnxm::AtomLocality::Local:
- gridBegin = 0;
- gridEnd = 1;
- break;
- case Nbnxm::AtomLocality::NonLocal:
- gridBegin = 1;
- gridEnd = gridSet.grids().size();
- break;
- case Nbnxm::AtomLocality::Count:
- GMX_ASSERT(false, "Count is invalid locality specifier");
- break;
- }
- for (int g = gridBegin; g < gridEnd; g++)
+ reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns*gridSet.grids().size(),
+ &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, nullptr);
+ reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns*gridSet.grids().size(),
+ &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, nullptr);
+
+ for (unsigned int g = 0; g < gridSet.grids().size(); g++)
{
const Nbnxm::Grid &grid = gridSet.grids()[g];
const int *cxy_ind = grid.cxy_ind().data();
const int numRealAtomsTotal = gridSet.numRealAtomsTotal();
- if (iloc == Nbnxm::InteractionLocality::Local)
- {
- reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
- reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
+ reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
+ reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
- if (atomIndicesSize > 0)
- {
- // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
- stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
- CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
-
- if (bDoTime)
- {
- gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
- }
+ if (atomIndicesSize > 0)
+ {
+ // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
+ stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
+ CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
+
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ }
- copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
+ copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
- if (bDoTime)
- {
- gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
- }
-
- stat = cudaHostUnregister((void*) atomIndices);
- CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ }
+
+ stat = cudaHostUnregister((void*) atomIndices);
+ CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
+ }
- reallocateDeviceBuffer(&gpu_nbv->cxy_na[locality], numColumns, &gpu_nbv->ncxy_na[locality], &gpu_nbv->ncxy_na_alloc[locality], nullptr);
- reallocateDeviceBuffer(&gpu_nbv->cxy_ind[locality], numColumns, &gpu_nbv->ncxy_ind[locality], &gpu_nbv->ncxy_ind_alloc[locality], nullptr);
- gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
- copyToDeviceBuffer(&gpu_nbv->cxy_na[locality], cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ int* destPtr = &gpu_nbv->cxy_na[maxNumColumns*g];
+ copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
- gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
}
stat = cudaHostUnregister((void*) cxy_na);
- gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
- copyToDeviceBuffer(&gpu_nbv->cxy_ind[locality], cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ destPtr = &gpu_nbv->cxy_ind[maxNumColumns*g];
+ copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
- gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
}
stat = cudaHostUnregister((void*) cxy_ind);
CU_RET_ERR(stat, "cudaHostUnRegister failed on cxy_ind");
}
}
+
+ // The above data is transferred on the local stream but is a
+ // dependency of the nonlocal stream (specifically the nonlocal X
+ // buf ops kernel). We therefore set a dependency to ensure
+ // that the nonlocal stream waits on the local stream here.
+ // This call records an event in the local stream:
+ nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
+ // ...and this call instructs the nonlocal stream to wait on that event:
+ nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+
int atomIndicesSize;
//! size of atom indices allocated in device buffer
int atomIndicesSize_alloc;
- //! x buf ops num of atoms (local and non-local)
- gmx::EnumerationArray<Nbnxm::AtomLocality, int *> cxy_na;
+ //! x buf ops num of atoms
+ int *cxy_na;
//! number of elements in cxy_na
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_na;
+ int ncxy_na;
//! number of elements allocated in device buffer
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_na_alloc;
+ int ncxy_na_alloc;
- //! x buf ops cell index mapping (local and non-local)
- gmx::EnumerationArray<Nbnxm::AtomLocality, int *> cxy_ind;
+ //! x buf ops cell index mapping
+ int *cxy_ind;
//! number of elements in cxy_ind
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_ind;
+ int ncxy_ind;
//! number of elements allocated in device buffer
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_ind_alloc;
+ int ncxy_ind_alloc;
//! parameters required for the non-bonded calc.
cu_nbparam_t *nbparam;
//! pair-list data structures (local and non-local)
is done (and the local transfer can proceed) */
cudaEvent_t misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
the local stream that need to precede the
- non-local force calculations are done
- (e.g. f buffer 0-ing, local x/q H2D) */
+ non-local force or buffer-ops calculations are done
+ (e.g. f buffer 0-ing, local x/q H2D, buffer-ops
+ initialization in the local stream that is also
+ required by the nonlocal stream) */
/* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
* concurrent streams, so we won't time if both l/nl work is done on GPUs.
/* We are done setting up all grids, we can resize the force buffers */
nbat->resizeForceBuffers();
}
+
+ int maxNumColumns = 0;
+ for (const auto &grid : grids())
+ {
+ maxNumColumns = std::max(maxNumColumns, grid.numColumns());
+ }
+ setNumColumnsMax(maxNumColumns);
+
+ //! Returns the maximum number of columns across all grids
+ int numColumnsMax() const
+ {
+ return numColumnsMax_;
+ }
+
+ //! Sets the maximum number of columns across all grids
+ void setNumColumnsMax(int numColumnsMax)
+ {
+ numColumnsMax_ = numColumnsMax;
+ }
+
private:
//! Returns collection of the data that covers all grids
const GridSetData getGridSetData()
int numRealAtomsTotal_;
//! Working data for constructing a single grid, one entry per thread
std::vector<GridWork> gridWork_;
+ //! Maximum number of columns across all grids
+ int numColumnsMax_;
+
-nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu(const Nbnxm::AtomLocality locality)
+nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu()
- nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(),
- gpu_nbv,
- locality);
-
-
+ Nbnxm::nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(), gpu_nbv);
+}
+
+void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLocality interactionLocality)
+{
+ Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
void *xPmeDevicePtr,
gmx_wallcycle *wcycle);
- //! Init for GPU version of setup coordinates in Nbnxm, for the given locality
- void atomdata_init_copy_x_to_nbat_x_gpu(Nbnxm::AtomLocality locality);
+ //! Init for GPU version of setup coordinates in Nbnxm
+ void atomdata_init_copy_x_to_nbat_x_gpu();
+ //! Sync the nonlocal GPU stream with dependent tasks in the local queue.
+ void insertNonlocalGpuDependency(Nbnxm::InteractionLocality interactionLocality);
//! Returns a reference to the pairlist sets
const PairlistSets &pairlistSets() const
* Called on the NS step and performs (re-)allocations and memory copies. */
CUDA_FUNC_QUALIFIER
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
- gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
- Nbnxm::AtomLocality gmx_unused locality) CUDA_FUNC_TERM
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM
/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
*/
gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
void gmx_unused *xPmeDevicePtr,
Nbnxm::AtomLocality gmx_unused locality,
- const rvec gmx_unused *x) CUDA_FUNC_TERM
+ const rvec gmx_unused *x,
+ int gmx_unused gridId,
+ int gmx_unused numColumnsMax) CUDA_FUNC_TERM
+
+/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ * \param[in] nb The nonbonded data GPU structure
+ * \param[in] interactionLocality Local or NonLocal sync point
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused *nb,
+ const InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM
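
For reference, a hedged sketch of how the CUDA_FUNC_QUALIFIER/CUDA_FUNC_TERM pair used above typically expands (the real definitions live in GROMACS's gpu_macros.h and differ in detail): with CUDA enabled the header yields a plain declaration that the .cu file defines; without it, an empty inline stub, so callers need no #ifdef.

```cpp
// HAVE_HYPOTHETICAL_CUDA_BUILD stands in for the real GROMACS config check.
#if HAVE_HYPOTHETICAL_CUDA_BUILD
    #define CUDA_FUNC_QUALIFIER            // header declares; .cu defines
    #define CUDA_FUNC_TERM ;
#else
    #define CUDA_FUNC_QUALIFIER static inline
    #define CUDA_FUNC_TERM {}              // no-op stub for CPU-only builds
#endif
```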