From 32b7891450622add82abfac2f44437b3a5f6e92e Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Wed, 12 Jun 2019 08:03:27 -0700
Subject: [PATCH] Bug fix and simplification for CUDA X Buffer Ops

Fixes a bug triggered when there are more than two grids (i.e. when the
grids do not map naively to local/nonlocal) by using a separate GPU
memory space for the cxy_na and cxy_ind data of each grid. Previously
there was only one local and one nonlocal space, so with more than two
grids the data was overwritten. Also simplifies the code: the init
function is now called only once per NS step to set up the data for all
grids, where previously it was called separately for the local and
nonlocal localities.

Change-Id: Ia2b97d22324aa97dca34b05da2eca2e2090372af
---
 src/gromacs/mdlib/sim_util.cpp                |   8 +-
 src/gromacs/mdrun/md.cpp                      |   9 +-
 src/gromacs/nbnxm/atomdata.cpp                |   2 +-
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          |  60 +++++----
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        | 119 ++++++++----------
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h     |  22 ++--
 src/gromacs/nbnxm/gridset.cpp                 |   8 ++
 src/gromacs/nbnxm/gridset.h                   |  15 +++
 src/gromacs/nbnxm/nbnxm.cpp                   |  12 +-
 src/gromacs/nbnxm/nbnxm.h                     |   6 +-
 src/gromacs/nbnxm/nbnxm_gpu.h                 |  15 ++-
 11 files changed, 144 insertions(+), 132 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index baecb96ba1..ad61dee88a 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1040,7 +1040,7 @@ void do_force(FILE *fplog,
 
         if (useGpuXBufOps)
         {
-            nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::Local);
+            nbv->atomdata_init_copy_x_to_nbat_x_gpu();
         }
     }
 
@@ -1107,12 +1107,6 @@ void do_force(FILE *fplog,
                                &top->excls, step, nrnb);
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             wallcycle_stop(wcycle, ewcNS);
-
-            if (useGpuXBufOps)
-            {
-
-                nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::NonLocal);
-            }
         }
         else
         {
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index aba6ba9ca1..b5869d6521 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -300,10 +300,6 @@ void gmx::Simulator::do_md()
         stateInstance = std::make_unique<t_state>();
         state         = stateInstance.get();
-        if (fr->nbv->useGpu())
-        {
-            changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
-        }
         dd_init_local_state(cr->dd, state_global, state);
 
         /* Distribute the charge groups over the nodes from the master node */
@@ -346,6 +342,11 @@ void gmx::Simulator::do_md()
     }
 
+    if (fr->nbv->useGpu())
+    {
+        changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
+    }
+
     // NOTE: The global state is no longer used at this point.
     // But state_global is still used as temporary storage space for writing
     // the global state to file and potentially for replica exchange.
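The fix hinges on a flat per-grid layout for the cxy_na and cxy_ind device
arrays: a single allocation of numColumnsMax*numGrids ints, in which grid g
owns the slot starting at numColumnsMax*g. A minimal standalone sketch of
that addressing scheme (DeviceIntBuffer and gridSlot are illustrative names,
not GROMACS API):

    // One device allocation shared by all grids; grid g owns the
    // half-open element range [numColumnsMax*g, numColumnsMax*(g+1)).
    struct DeviceIntBuffer
    {
        int *data; // device pointer to numColumnsMax*numGrids ints
    };

    // A fixed stride means grids can never overwrite each other's slots,
    // however many grids exist, which is the bug this patch fixes.
    static const int *gridSlot(const DeviceIntBuffer &buf, int gridId, int numColumnsMax)
    {
        return &buf.data[numColumnsMax * gridId];
    }

This is the same indexing the kernel launch below uses when forming the
d_cxy_na and d_cxy_ind arguments.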
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index 55084763f3..c1271d8bba 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1043,7 +1043,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
                                       gpu_nbv,
                                       xPmeDevicePtr,
                                       locality,
-                                      x);
+                                      x, g, gridSet.numColumnsMax());
             }
         }
         else
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 7162c9f593..a0117a5cb1 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -277,11 +277,11 @@ static inline int calc_shmem_required_nonbonded(const int num_threads_z, const g
  *
  * As the point where the local stream tasks can be considered complete happens
  * at the same call point where the nonlocal stream should be synced with the
- * the local, this function recrds the event if called with the local stream as
+ * the local, this function records the event if called with the local stream as
  * argument and inserts in the GPU stream a wait on the event on the nonlocal.
  */
-static void insertNonlocalGpuDependency(const gmx_nbnxn_cuda_t   *nb,
-                                        const InteractionLocality interactionLocality)
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t   *nb,
+                                      const InteractionLocality interactionLocality)
 {
     cudaStream_t stream = nb->stream[interactionLocality];
@@ -375,7 +375,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb,
        This wait needs to precede any PP tasks, bonded or nonbonded, that may
        compute on interactions between local and nonlocal atoms.
      */
-    insertNonlocalGpuDependency(nb, iloc);
+    nbnxnInsertNonlocalGpuDependency(nb, iloc);
 }
 
 /*! As we execute nonbonded workload in separate streams, before launching
@@ -743,7 +743,9 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid        &grid,
                            gmx_nbnxn_gpu_t          *nb,
                            void                     *xPmeDevicePtr,
                            const Nbnxm::AtomLocality locality,
-                           const rvec               *x)
+                           const rvec               *x,
+                           int                       gridId,
+                           int                       numColumnsMax)
 {
     cu_atomdata_t *adat    = nb->atdat;
     bool           bDoTime = nb->bDoTime;
@@ -751,17 +753,11 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
     const int numColumns      = grid.numColumns();
     const int cellOffset      = grid.cellOffset();
     const int numAtomsPerCell = grid.numAtomsPerCell();
-    // TODO: Document this, one can not infer the interaction locality from the atom locality
-    Nbnxm::InteractionLocality interactionLoc = Nbnxm::InteractionLocality::Local;
-    int nCopyAtoms    = grid.srcAtomEnd() - grid.srcAtomBegin();
-    int copyAtomStart = grid.srcAtomBegin();
+    Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality);
+    int nCopyAtoms    = grid.srcAtomEnd() - grid.srcAtomBegin();
+    int copyAtomStart = grid.srcAtomBegin();
 
-    if (locality == Nbnxm::AtomLocality::NonLocal)
-    {
-        interactionLoc = Nbnxm::InteractionLocality::NonLocal;
-    }
-
-    cudaStream_t stream = nb->stream[interactionLoc];
+    cudaStream_t stream = nb->stream[interactionLoc];
 
     // FIXME: need to either let the local stream get to the
     // insertNonlocalGpuDependency call or call it separately here
@@ -769,7 +765,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
     {
         if (interactionLoc == Nbnxm::InteractionLocality::Local)
        {
-            insertNonlocalGpuDependency(nb, interactionLoc);
+            nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
         }
         return;
     }
@@ -820,24 +816,24 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
         config.sharedMemorySize = 0;
         config.stream           = stream;
 
-        auto       kernelFn      = nbnxn_gpu_x_to_nbat_x_kernel;
-        float     *xqPtr         = &(adat->xq->x);
-        const int *d_atomIndices = nb->atomIndices;
-        const int *d_cxy_na      = nb->cxy_na[locality];
-        const int *d_cxy_ind     = nb->cxy_ind[locality];
-        const auto kernelArgs    = prepareGpuKernelArguments(kernelFn, config,
-                                                             &numColumns,
-                                                             &xqPtr,
-                                                             &setFillerCoords,
-                                                             &d_x,
-                                                             &d_atomIndices,
-                                                             &d_cxy_na,
-                                                             &d_cxy_ind,
-                                                             &cellOffset,
-                                                             &numAtomsPerCell);
+        auto       kernelFn      = nbnxn_gpu_x_to_nbat_x_kernel;
+        float     *xqPtr         = &(adat->xq->x);
+        const int *d_atomIndices = nb->atomIndices;
+        const int *d_cxy_na      = &nb->cxy_na[numColumnsMax*gridId];
+        const int *d_cxy_ind     = &nb->cxy_ind[numColumnsMax*gridId];
+        const auto kernelArgs    = prepareGpuKernelArguments(kernelFn, config,
+                                                             &numColumns,
+                                                             &xqPtr,
+                                                             &setFillerCoords,
+                                                             &d_x,
+                                                             &d_atomIndices,
+                                                             &d_cxy_na,
+                                                             &d_cxy_ind,
+                                                             &cellOffset,
+                                                             &numAtomsPerCell);
 
         launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
 
-        insertNonlocalGpuDependency(nb, interactionLoc);
+        nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
     }
 } // namespace Nbnxm
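As the comment on nbnxnInsertNonlocalGpuDependency above describes, the
function records an event when called for the local interaction locality and
makes the nonlocal stream wait on that event otherwise. A self-contained
sketch of that standard CUDA record/wait idiom (the names are illustrative
and error checking is omitted):

    #include <cuda_runtime.h>

    // Record a completion marker in the local stream, or make the nonlocal
    // stream wait for it, mirroring the two call directions in this patch.
    static void insertDependency(cudaStream_t localStream,
                                 cudaStream_t nonlocalStream,
                                 cudaEvent_t  localDone,
                                 bool         calledForLocal)
    {
        if (calledForLocal)
        {
            // Marks the point in the local stream that nonlocal work depends on.
            cudaEventRecord(localDone, localStream);
        }
        else
        {
            // Work submitted to nonlocalStream after this call waits for the event.
            cudaStreamWaitEvent(nonlocalStream, localDone, 0);
        }
    }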
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 7af2d94cb0..fd2da0cef5 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -495,18 +495,14 @@ gpu_init(const gmx_device_info_t *deviceInfo,
 
     cuda_init_const(nb, ic, listParams, nbat->params());
 
-    nb->natoms                                  = 0;
-    nb->natoms_alloc                            = 0;
-    nb->atomIndicesSize                         = 0;
-    nb->atomIndicesSize_alloc                   = 0;
-    nb->ncxy_na[AtomLocality::Local]            = 0;
-    nb->ncxy_na[AtomLocality::NonLocal]         = 0;
-    nb->ncxy_na_alloc[AtomLocality::Local]      = 0;
-    nb->ncxy_na_alloc[AtomLocality::NonLocal]   = 0;
-    nb->ncxy_ind[AtomLocality::Local]           = 0;
-    nb->ncxy_ind[AtomLocality::NonLocal]        = 0;
-    nb->ncxy_ind_alloc[AtomLocality::Local]     = 0;
-    nb->ncxy_ind_alloc[AtomLocality::NonLocal]  = 0;
+    nb->natoms                = 0;
+    nb->natoms_alloc          = 0;
+    nb->atomIndicesSize       = 0;
+    nb->atomIndicesSize_alloc = 0;
+    nb->ncxy_na               = 0;
+    nb->ncxy_na_alloc         = 0;
+    nb->ncxy_ind              = 0;
+    nb->ncxy_ind_alloc        = 0;
 
     if (debug)
     {
@@ -874,36 +870,20 @@ rvec *gpu_get_fshift(gmx_nbnxn_gpu_t *nb)
 /* Initialization for X buffer operations on GPU. */
 /* TODO  Remove explicit pinning from host arrays from here and manage in a more natural way*/
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
-                                gmx_nbnxn_gpu_t      *gpu_nbv,
-                                const Nbnxm::AtomLocality locality)
+                                gmx_nbnxn_gpu_t      *gpu_nbv)
 {
     cudaError_t stat;
-    const Nbnxm::InteractionLocality iloc = ((locality == AtomLocality::Local) ?
-                                             InteractionLocality::Local : InteractionLocality::NonLocal);
-    cudaStream_t stream = gpu_nbv->stream[iloc];
+    cudaStream_t stream  = gpu_nbv->stream[InteractionLocality::Local];
     bool         bDoTime = gpu_nbv->bDoTime;
-    int          gridBegin = 0, gridEnd = 0;
+    const int    maxNumColumns = gridSet.numColumnsMax();
 
-    switch (locality)
-    {
-        case Nbnxm::AtomLocality::All:
-            gridBegin = 0;
-            gridEnd   = gridSet.grids().size();
-            break;
-        case Nbnxm::AtomLocality::Local:
-            gridBegin = 0;
-            gridEnd   = 1;
-            break;
-        case Nbnxm::AtomLocality::NonLocal:
-            gridBegin = 1;
-            gridEnd   = gridSet.grids().size();
-            break;
-        case Nbnxm::AtomLocality::Count:
-            GMX_ASSERT(false, "Count is invalid locality specifier");
-            break;
-    }
-
-    for (int g = gridBegin; g < gridEnd; g++)
+    reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns*gridSet.grids().size(),
+                           &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, nullptr);
+    reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns*gridSet.grids().size(),
+                           &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, nullptr);
+
+    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
     {
         const Nbnxm::Grid &grid = gridSet.grids()[g];
@@ -915,37 +895,30 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
         const int *cxy_ind           = grid.cxy_ind().data();
         const int  numRealAtomsTotal = gridSet.numRealAtomsTotal();
 
-        if (iloc == Nbnxm::InteractionLocality::Local)
-        {
+        reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
+        reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
 
-            reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
-            reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
+        if (atomIndicesSize > 0)
+        {
+            // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
+            stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
+            CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
 
-            if (atomIndicesSize > 0)
+            if (bDoTime)
             {
-                // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
-                stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
-                CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
-
-                if (bDoTime)
-                {
-                    gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
-                }
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+            }
 
-                copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
+            copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
 
-                if (bDoTime)
-                {
-                    gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
-                }
-
-                stat = cudaHostUnregister((void*) atomIndices);
-                CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
+            if (bDoTime)
+            {
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
             }
-        }
 
-        reallocateDeviceBuffer(&gpu_nbv->cxy_na[locality], numColumns, &gpu_nbv->ncxy_na[locality], &gpu_nbv->ncxy_na_alloc[locality], nullptr);
-        reallocateDeviceBuffer(&gpu_nbv->cxy_ind[locality], numColumns, &gpu_nbv->ncxy_ind[locality], &gpu_nbv->ncxy_ind_alloc[locality], nullptr);
+            stat = cudaHostUnregister((void*) atomIndices);
+            CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
+        }
 
         if (numColumns > 0)
         {
@@ -955,14 +928,15 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
             }
 
-            copyToDeviceBuffer(&gpu_nbv->cxy_na[locality], cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+            int* destPtr = &gpu_nbv->cxy_na[maxNumColumns*g];
+            copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
             }
 
             stat = cudaHostUnregister((void*) cxy_na);
@@ -974,20 +948,31 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
             }
 
-            copyToDeviceBuffer(&gpu_nbv->cxy_ind[locality], cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+            destPtr = &gpu_nbv->cxy_ind[maxNumColumns*g];
+            copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
             }
 
             stat = cudaHostUnregister((void*) cxy_ind);
             CU_RET_ERR(stat, "cudaHostUnRegister failed on cxy_ind");
         }
     }
+
+    // The above data is transferred on the local stream but is a
+    // dependency of the nonlocal stream (specifically the nonlocal X
+    // buf ops kernel). We therefore set a dependency to ensure
+    // that the nonlocal stream waits on the local stream here.
+    // This call records an event in the local stream:
+    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
+    // ...and this call instructs the nonlocal stream to wait on that event:
+    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+
+    return;
 }
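The init function above still pins the host arrays on the fly, as its TODO
notes, so that the asynchronous H2D copies are valid; each grid's columns
land at offset maxNumColumns*g in the shared device buffers. A standalone
sketch of that register/copy/unregister sequence follows; it synchronizes
the stream before unpinning to keep the sketch self-evidently safe, whereas
the real code goes through copyToDeviceBuffer() and the event dependency set
up at the end of the function (uploadGridSlot is an illustrative helper, not
GROMACS API):

    #include <cuda_runtime.h>

    // Transiently pin a host array, copy it into one grid's slot of the
    // shared device buffer, then unpin.
    static void uploadGridSlot(int *d_base, const int *h_src, int numColumns,
                               int gridId, int maxNumColumns, cudaStream_t stream)
    {
        cudaHostRegister(const_cast<int *>(h_src),
                         numColumns * sizeof(int), cudaHostRegisterDefault);
        cudaMemcpyAsync(d_base + maxNumColumns * gridId, h_src,
                        numColumns * sizeof(int), cudaMemcpyHostToDevice, stream);
        cudaStreamSynchronize(stream); // ensure the copy finished before unpinning
        cudaHostUnregister(const_cast<int *>(h_src));
    }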
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index bd038c6f03..48c8776c79 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -229,18 +229,18 @@ struct gmx_nbnxn_cuda_t
     int atomIndicesSize;
     //! size of atom indices allocated in device buffer
     int atomIndicesSize_alloc;
-    //! x buf ops num of atoms (local and non-local)
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int *> cxy_na;
+    //! x buf ops num of atoms
+    int *cxy_na;
     //! number of elements in cxy_na
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int> ncxy_na;
+    int  ncxy_na;
     //! number of elements allocated allocated in device buffer
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int> ncxy_na_alloc;
-    //! x buf ops cell index mapping (local and non-local)
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int *> cxy_ind;
+    int  ncxy_na_alloc;
+    //! x buf ops cell index mapping
+    int *cxy_ind;
     //! number of elements in cxy_ind
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int> ncxy_ind;
+    int  ncxy_ind;
     //! number of elements allocated allocated in device buffer
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int> ncxy_ind_alloc;
+    int  ncxy_ind_alloc;
     //! parameters required for the non-bonded calc.
     cu_nbparam_t *nbparam;
     //! pair-list data structures (local and non-local)
@@ -255,8 +255,10 @@ struct gmx_nbnxn_cuda_t
                                                 is done (and the local transfer can proceed) */
     cudaEvent_t misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in the local
                                                 stream that need to precede the
-                                                non-local force calculations are done
-                                                (e.g. f buffer 0-ing, local x/q H2D) */
+                                                non-local force or buffer operation calculations are done
+                                                (e.g. f buffer 0-ing, local x/q H2D, buffer op
+                                                initialization in local stream that is required also
+                                                by nonlocal stream ) */
 
 /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
  * concurrent streams, so we won't time if both l/nl work is done on GPUs.
diff --git a/src/gromacs/nbnxm/gridset.cpp b/src/gromacs/nbnxm/gridset.cpp
index 168fdd8792..feeb005594 100644
--- a/src/gromacs/nbnxm/gridset.cpp
+++ b/src/gromacs/nbnxm/gridset.cpp
@@ -224,6 +224,14 @@ void GridSet::putOnGrid(const matrix box,
         /* We are done setting up all grids, we can resize the force buffers */
         nbat->resizeForceBuffers();
     }
+
+    int maxNumColumns = 0;
+    for (const auto &grid : grids())
+    {
+        maxNumColumns = std::max(maxNumColumns, grid.numColumns());
+    }
+    setNumColumnsMax(maxNumColumns);
+
 }
 
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/gridset.h b/src/gromacs/nbnxm/gridset.h
index e59a8584c1..a4eaf6256e 100644
--- a/src/gromacs/nbnxm/gridset.h
+++ b/src/gromacs/nbnxm/gridset.h
@@ -185,6 +185,18 @@ class GridSet
             copy_mat(box_, box);
         }
 
+        //! Returns the maximum number of columns across all grids
+        int numColumnsMax() const
+        {
+            return numColumnsMax_;
+        }
+
+        //! Sets the maximum number of columns across all grids
+        void setNumColumnsMax(int numColumnsMax)
+        {
+            numColumnsMax_ = numColumnsMax;
+        }
+
     private:
         //! Returns collection of the data that covers all grids
         const GridSetData getGridSetData()
@@ -213,6 +225,9 @@ class GridSet
         int numRealAtomsTotal_;
         //! Working data for constructing a single grid, one entry per thread
         std::vector<GridWork> gridWork_;
+        //! Maximum number of columns across all grids
+        int numColumnsMax_;
+
 };
 
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 31037b2476..54c9ff9864 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -190,13 +190,13 @@ void nonbonded_verlet_t::changePairlistRadii(real rlistOuter,
 }
 
 void
-nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu(const Nbnxm::AtomLocality locality)
+nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu()
 {
+    Nbnxm::nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(), gpu_nbv);
+}
 
-    nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(),
-                               gpu_nbv,
-                               locality);
-
-
+void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLocality interactionLocality)
+{
+    Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
 }
 
 /*! \endcond */
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index b8bdb853cb..72eb98ae59 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -248,9 +248,11 @@ struct nonbonded_verlet_t
                                           void           *xPmeDevicePtr,
                                           gmx_wallcycle  *wcycle);
 
-        //! Init for GPU version of setup coordinates in Nbnxm, for the given locality
-        void atomdata_init_copy_x_to_nbat_x_gpu(Nbnxm::AtomLocality locality);
+        //! Init for GPU version of setup coordinates in Nbnxm
+        void atomdata_init_copy_x_to_nbat_x_gpu();
 
+        //! Sync the nonlocal GPU stream with dependent tasks in the local queue.
+        void insertNonlocalGpuDependency(Nbnxm::InteractionLocality interactionLocality);
 
         //! Returns a reference to the pairlist sets
         const PairlistSets &pairlistSets() const
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 49dc1bfce2..7e88129f4e 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -220,8 +220,7 @@ int gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETU
  * Called on the NS step and performs (re-)allocations and memory copies. !*/
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
-                                gmx_nbnxn_gpu_t gmx_unused      *gpu_nbv,
-                                Nbnxm::AtomLocality gmx_unused   locality) CUDA_FUNC_TERM
+                                gmx_nbnxn_gpu_t gmx_unused      *gpu_nbv) CUDA_FUNC_TERM
 
 /*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
  */
 CUDA_FUNC_QUALIFIER
@@ -231,7 +230,17 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid,
                            gmx_nbnxn_gpu_t gmx_unused     *gpu_nbv,
                            void gmx_unused                *xPmeDevicePtr,
                            Nbnxm::AtomLocality gmx_unused  locality,
-                           const rvec gmx_unused          *x) CUDA_FUNC_TERM
+                           const rvec gmx_unused          *x,
+                           int gmx_unused                  gridId,
+                           int gmx_unused                  numColumnsMax) CUDA_FUNC_TERM
+
+/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ * \param[in] nb                   The nonbonded data GPU structure
+ * \param[in] interactionLocality  Local or NonLocal sync point
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused    *nb,
+                                      const InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM
 
 } // namespace Nbnxm
-- 
2.22.0
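For reference, the stride shared by all the per-grid slots is the column
count of the widest grid, recomputed at the end of GridSet::putOnGrid() as
shown in the gridset.cpp hunk above. A minimal standalone version of that
reduction (the Grid stand-in is illustrative):

    #include <algorithm>
    #include <vector>

    // Stand-in for Nbnxm::Grid exposing only what the reduction needs.
    struct Grid
    {
        int columns;
        int numColumns() const { return columns; }
    };

    // Same loop as the one added to GridSet::putOnGrid(): the per-grid
    // slot stride must accommodate the widest grid in the set.
    static int computeNumColumnsMax(const std::vector<Grid> &grids)
    {
        int numColumnsMax = 0;
        for (const Grid &grid : grids)
        {
            numColumnsMax = std::max(numColumnsMax, grid.numColumns());
        }
        return numColumnsMax;
    }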