if (useGpuXBufOps)
{
- nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::Local);
+ nbv->atomdata_init_copy_x_to_nbat_x_gpu();
}
}
&top->excls, step, nrnb);
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
wallcycle_stop(wcycle, ewcNS);
-
- if (useGpuXBufOps)
- {
-
- nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::NonLocal);
- }
}
else
{
stateInstance = std::make_unique<t_state>();
state = stateInstance.get();
- if (fr->nbv->useGpu())
- {
- changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
- }
dd_init_local_state(cr->dd, state_global, state);
/* Distribute the charge groups over the nodes from the master node */
}
+ if (fr->nbv->useGpu())
+ {
+ changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
+ }
+
// NOTE: The global state is no longer used at this point.
// But state_global is still used as temporary storage space for writing
// the global state to file and potentially for replica exchange.
gpu_nbv,
xPmeDevicePtr,
locality,
- x);
+ x, g, gridSet.numColumnsMax());
}
}
else
*
* As the point where the local stream tasks can be considered complete happens
* at the same call point where the nonlocal stream should be synced with
- * the local, this function recrds the event if called with the local stream as
+ * the local, this function records the event if called with the local stream as
* argument, and inserts a wait on that event into the nonlocal stream otherwise.
*/
-static void insertNonlocalGpuDependency(const gmx_nbnxn_cuda_t *nb,
- const InteractionLocality interactionLocality)
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t *nb,
+ const InteractionLocality interactionLocality)
{
cudaStream_t stream = nb->stream[interactionLocality];
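For orientation, the body reduces to the standard CUDA record/wait idiom. A minimal sketch, assuming (per the struct-member documentation further below) that misc_ops_and_local_H2D_done is the event in play:

// Sketch only; mirrors what this function does with the stream selected above.
if (interactionLocality == InteractionLocality::Local)
{
    // Local stream: mark the point that later nonlocal work must not precede.
    cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
    CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
}
else
{
    // Nonlocal stream: hold it until the recorded local-stream event fires.
    cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
    CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
}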
This wait needs to precede any PP tasks, bonded or nonbonded, that may
compute on interactions between local and nonlocal atoms.
*/
- insertNonlocalGpuDependency(nb, iloc);
+ nbnxnInsertNonlocalGpuDependency(nb, iloc);
}
/*! As we execute nonbonded workload in separate streams, before launching
gmx_nbnxn_gpu_t *nb,
void *xPmeDevicePtr,
const Nbnxm::AtomLocality locality,
- const rvec *x)
+ const rvec *x,
+ int gridId,
+ int numColumnsMax)
{
cu_atomdata_t *adat = nb->atdat;
bool bDoTime = nb->bDoTime;
const int numColumns = grid.numColumns();
const int cellOffset = grid.cellOffset();
const int numAtomsPerCell = grid.numAtomsPerCell();
- // TODO: Document this, one can not infer the interaction locality from the atom locality
- Nbnxm::InteractionLocality interactionLoc = Nbnxm::InteractionLocality::Local;
- int nCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
- int copyAtomStart = grid.srcAtomBegin();
- if (locality == Nbnxm::AtomLocality::NonLocal)
- {
- interactionLoc = Nbnxm::InteractionLocality::NonLocal;
- }
-
- cudaStream_t stream = nb->stream[interactionLoc];
+ Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality);
+ int nCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
+ int copyAtomStart = grid.srcAtomBegin();
+
+ cudaStream_t stream = nb->stream[interactionLoc];
// FIXME: need to either let the local stream get to the
// nbnxnInsertNonlocalGpuDependency call or call it separately here
{
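/* Empty-domain early return: even with nothing to copy, the call below still
   records the dependency event when invoked for the local stream, because the
   nonlocal stream will later wait on that event. */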
if (interactionLoc == Nbnxm::InteractionLocality::Local)
{
- insertNonlocalGpuDependency(nb, interactionLoc);
+ nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
}
return;
}
config.sharedMemorySize = 0;
config.stream = stream;
- auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel;
- float *xqPtr = &(adat->xq->x);
- const int *d_atomIndices = nb->atomIndices;
- const int *d_cxy_na = nb->cxy_na[locality];
- const int *d_cxy_ind = nb->cxy_ind[locality];
- const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
- &numColumns,
- &xqPtr,
- &setFillerCoords,
- &d_x,
- &d_atomIndices,
- &d_cxy_na,
- &d_cxy_ind,
- &cellOffset,
- &numAtomsPerCell);
+ auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel;
+ float *xqPtr = &(adat->xq->x);
+ const int *d_atomIndices = nb->atomIndices;
+ const int *d_cxy_na = &nb->cxy_na[numColumnsMax*gridId];
+ const int *d_cxy_ind = &nb->cxy_ind[numColumnsMax*gridId];
+ const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
+ &numColumns,
+ &xqPtr,
+ &setFillerCoords,
+ &d_x,
+ &d_atomIndices,
+ &d_cxy_na,
+ &d_cxy_ind,
+ &cellOffset,
+ &numAtomsPerCell);
launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
- insertNonlocalGpuDependency(nb, interactionLoc);
+ nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
}
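The two pointer computations above carry the core of this change: the per-locality device arrays are replaced by one flat buffer per quantity, strided by numColumnsMax. Hypothetical helpers (illustrative names only, not part of the change) make the indexing explicit:

// Illustrative only: grid gridId's column data lives at a fixed stride offset.
static inline const int *gridCxyNa(const gmx_nbnxn_cuda_t *nb, int gridId, int numColumnsMax)
{
    return &nb->cxy_na[numColumnsMax*gridId];    // per-column atom counts of this grid
}
static inline const int *gridCxyInd(const gmx_nbnxn_cuda_t *nb, int gridId, int numColumnsMax)
{
    return &nb->cxy_ind[numColumnsMax*gridId];   // cell-index mapping of this grid
}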
} // namespace Nbnxm
cuda_init_const(nb, ic, listParams, nbat->params());
- nb->natoms = 0;
- nb->natoms_alloc = 0;
- nb->atomIndicesSize = 0;
- nb->atomIndicesSize_alloc = 0;
- nb->ncxy_na[AtomLocality::Local] = 0;
- nb->ncxy_na[AtomLocality::NonLocal] = 0;
- nb->ncxy_na_alloc[AtomLocality::Local] = 0;
- nb->ncxy_na_alloc[AtomLocality::NonLocal] = 0;
- nb->ncxy_ind[AtomLocality::Local] = 0;
- nb->ncxy_ind[AtomLocality::NonLocal] = 0;
- nb->ncxy_ind_alloc[AtomLocality::Local] = 0;
- nb->ncxy_ind_alloc[AtomLocality::NonLocal] = 0;
+ nb->natoms = 0;
+ nb->natoms_alloc = 0;
+ nb->atomIndicesSize = 0;
+ nb->atomIndicesSize_alloc = 0;
+ nb->ncxy_na = 0;
+ nb->ncxy_na_alloc = 0;
+ nb->ncxy_ind = 0;
+ nb->ncxy_ind_alloc = 0;
if (debug)
{
/* Initialization for X buffer operations on GPU. */
/* TODO Remove the explicit pinning of host arrays from here and manage it in a more natural way */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
- gmx_nbnxn_gpu_t *gpu_nbv,
- const Nbnxm::AtomLocality locality)
+ gmx_nbnxn_gpu_t *gpu_nbv)
{
cudaError_t stat;
- const Nbnxm::InteractionLocality iloc = ((locality == AtomLocality::Local) ?
- InteractionLocality::Local : InteractionLocality::NonLocal);
- cudaStream_t stream = gpu_nbv->stream[iloc];
+ cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
bool bDoTime = gpu_nbv->bDoTime;
- int gridBegin = 0, gridEnd = 0;
+ const int maxNumColumns = gridSet.numColumnsMax();
- switch (locality)
- {
- case Nbnxm::AtomLocality::All:
- gridBegin = 0;
- gridEnd = gridSet.grids().size();
- break;
- case Nbnxm::AtomLocality::Local:
- gridBegin = 0;
- gridEnd = 1;
- break;
- case Nbnxm::AtomLocality::NonLocal:
- gridBegin = 1;
- gridEnd = gridSet.grids().size();
- break;
- case Nbnxm::AtomLocality::Count:
- GMX_ASSERT(false, "Count is invalid locality specifier");
- break;
- }
- for (int g = gridBegin; g < gridEnd; g++)
+ reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns*gridSet.grids().size(),
+ &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, nullptr);
+ reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns*gridSet.grids().size(),
+ &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, nullptr);
+
+ for (unsigned int g = 0; g < gridSet.grids().size(); g++)
{
const Nbnxm::Grid &grid = gridSet.grids()[g];
const int *cxy_ind = grid.cxy_ind().data();
const int numRealAtomsTotal = gridSet.numRealAtomsTotal();
- if (iloc == Nbnxm::InteractionLocality::Local)
- {
- reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
- reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
- if (atomIndicesSize > 0)
- {
- // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
- stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
- CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
-
- if (bDoTime)
- {
- gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
- }
- copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
- if (bDoTime)
- {
- gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
- }
-
- stat = cudaHostUnregister((void*) atomIndices);
- CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
- }
- }
- reallocateDeviceBuffer(&gpu_nbv->cxy_na[locality], numColumns, &gpu_nbv->ncxy_na[locality], &gpu_nbv->ncxy_na_alloc[locality], nullptr);
- reallocateDeviceBuffer(&gpu_nbv->cxy_ind[locality], numColumns, &gpu_nbv->ncxy_ind[locality], &gpu_nbv->ncxy_ind_alloc[locality], nullptr);
+ reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
+ reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
+ if (atomIndicesSize > 0)
+ {
+ // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
+ stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
+ CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
+
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ }
+ copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ }
+
+ stat = cudaHostUnregister((void*) atomIndices);
+ CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
+ }
if (numColumns > 0)
{
if (bDoTime)
{
- gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
}
- copyToDeviceBuffer(&gpu_nbv->cxy_na[locality], cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ int* destPtr = &gpu_nbv->cxy_na[maxNumColumns*g];
+ copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
{
- gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
}
stat = cudaHostUnregister((void*) cxy_na);
if (bDoTime)
{
- gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
}
- copyToDeviceBuffer(&gpu_nbv->cxy_ind[locality], cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ destPtr = &gpu_nbv->cxy_ind[maxNumColumns*g];
+ copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
{
- gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
}
stat = cudaHostUnregister((void*) cxy_ind);
CU_RET_ERR(stat, "cudaHostUnRegister failed on cxy_ind");
}
}
+
+ // The above data is transferred on the local stream but is a
+ // dependency of the nonlocal stream (specifically the nonlocal X
+ // buf ops kernel). We therefore set a dependency to ensure
+ // that the nonlocal stream waits on the local stream here.
+ // This call records an event in the local stream:
+ nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
+ // ...and this call instructs the nonlocal stream to wait on that event:
+ nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+
return;
}
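To make the allocation arithmetic above concrete, a worked example with purely hypothetical grid sizes:

// Hypothetical: a local grid plus two nonlocal zones.
const int numGrids         = 3;
const int columnsPerGrid[] = { 96, 64, 80 };         // per-grid column counts
const int numColumnsMax    = 96;                     // max over all grids
const int flatBufferSize   = numColumnsMax*numGrids; // 288 ints each for cxy_na and cxy_ind
// Grid g's columns occupy [numColumnsMax*g, numColumnsMax*g + columnsPerGrid[g]);
// the tail of each stride remains unused padding.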
int atomIndicesSize;
//! size of atom indices allocated in device buffer
int atomIndicesSize_alloc;
- //! x buf ops num of atoms (local and non-local)
- gmx::EnumerationArray<Nbnxm::AtomLocality, int *> cxy_na;
+ //! x buf ops num of atoms
+ int *cxy_na;
//! number of elements in cxy_na
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_na;
+ int ncxy_na;
//! number of elements allocated in device buffer
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_na_alloc;
- //! x buf ops cell index mapping (local and non-local)
- gmx::EnumerationArray<Nbnxm::AtomLocality, int *> cxy_ind;
+ int ncxy_na_alloc;
+ //! x buf ops cell index mapping
+ int *cxy_ind;
//! number of elements in cxy_ind
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_ind;
+ int ncxy_ind;
//! number of elements allocated in device buffer
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_ind_alloc;
+ int ncxy_ind_alloc;
//! parameters required for the non-bonded calc.
cu_nbparam_t *nbparam;
//! pair-list data structures (local and non-local)
is done (and the local transfer can proceed) */
cudaEvent_t misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
the local stream that need to precede the
- non-local force calculations are done
- (e.g. f buffer 0-ing, local x/q H2D) */
+ non-local force or buffer-ops calculations are done
+ (e.g. f buffer 0-ing, local x/q H2D, and buffer-ops
+ initialization in the local stream that is also
+ required by the nonlocal stream) */
/* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
* concurrent streams, so we won't time if both l/nl work is done on GPUs.
/* We are done setting up all grids, we can resize the force buffers */
nbat->resizeForceBuffers();
}
+
+ int maxNumColumns = 0;
+ for (const auto &grid : grids())
+ {
+ maxNumColumns = std::max(maxNumColumns, grid.numColumns());
+ }
+ setNumColumnsMax(maxNumColumns);
+
}
} // namespace Nbnxm
copy_mat(box_, box);
}
+ //! Returns the maximum number of columns across all grids
+ int numColumnsMax() const
+ {
+ return numColumnsMax_;
+ }
+
+ //! Sets the maximum number of columns across all grids
+ void setNumColumnsMax(int numColumnsMax)
+ {
+ numColumnsMax_ = numColumnsMax;
+ }
+
private:
//! Returns collection of the data that covers all grids
const GridSetData getGridSetData()
int numRealAtomsTotal_;
//! Working data for constructing a single grid, one entry per thread
std::vector<GridWork> gridWork_;
+ //! Maximum number of columns across all grids
+ int numColumnsMax_;
+
};
} // namespace Nbnxm
}
void
-nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu(const Nbnxm::AtomLocality locality)
+nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu()
{
- nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(),
- gpu_nbv,
- locality);
-
-
+ Nbnxm::nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(), gpu_nbv);
+}
+
+void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLocality interactionLocality)
+{
+ Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
}
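A hypothetical call site for the new wrapper, mirroring the record-then-wait pair issued at the end of nbnxn_gpu_init_x_to_nbat_x above:

// Hypothetical usage from stepping code (not part of this change):
nbv->insertNonlocalGpuDependency(Nbnxm::InteractionLocality::Local);    // record event on the local stream
nbv->insertNonlocalGpuDependency(Nbnxm::InteractionLocality::NonLocal); // make the nonlocal stream wait on it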
/*! \endcond */
void *xPmeDevicePtr,
gmx_wallcycle *wcycle);
- //! Init for GPU version of setup coordinates in Nbnxm, for the given locality
- void atomdata_init_copy_x_to_nbat_x_gpu(Nbnxm::AtomLocality locality);
+ //! Init for GPU version of setup coordinates in Nbnxm
+ void atomdata_init_copy_x_to_nbat_x_gpu();
+ //! Sync the nonlocal GPU stream with dependent tasks in the local queue.
+ void insertNonlocalGpuDependency(Nbnxm::InteractionLocality interactionLocality);
//! Returns a reference to the pairlist sets
const PairlistSets &pairlistSets() const
* Called on the NS step and performs (re-)allocations and memory copies. */
CUDA_FUNC_QUALIFIER
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
- gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
- Nbnxm::AtomLocality gmx_unused locality) CUDA_FUNC_TERM
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM
/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
*/
gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
void gmx_unused *xPmeDevicePtr,
Nbnxm::AtomLocality gmx_unused locality,
- const rvec gmx_unused *x) CUDA_FUNC_TERM
+ const rvec gmx_unused *x,
+ int gmx_unused gridId,
+ int gmx_unused numColumnsMax) CUDA_FUNC_TERM
+
+/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ * \param[in] nb The nonbonded data GPU structure
+ * \param[in] interactionLocality Local or NonLocal sync point
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused *nb,
+ const InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM
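
For readers unfamiliar with the convention: in non-CUDA builds these declarations collapse to empty stubs, roughly as follows (an assumption based on the usual gpu_macros.h definitions, which are not part of this change):

// Approximate expansion without CUDA (illustrative):
//   CUDA_FUNC_QUALIFIER -> static
//   CUDA_FUNC_TERM      -> {}
static void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t * /*nb*/,
                                             const InteractionLocality /*interactionLocality*/) {}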
} // namespace Nbnxm