From 54c2472992b082d43bc020a0165d47a64165991e Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Tue, 3 Sep 2019 01:00:13 -0700
Subject: [PATCH] GPU Force Halo Exchange

Activate with GMX_GPU_DD_COMMS environment variable.

Extends the GPU halo exchange feature to provide GPU force halo
exchange functionality. Does not yet support virial steps, which
require an extra shift force reduction - these are currently performed
on the non-buffer-ops / non-direct-comm path. Also has the same
limitations as the coordinate halo exchange.

Performs part of #2890. Future work to improve synchronization towards
a more one-sided scheme (#3092) and to make dependencies more explicit
(#3093).

Change-Id: Ifc23cc8db2655f7258e68b34e7cdc7b71994e1e8
---
 src/gromacs/domdec/gpuhaloexchange.h        |  36 ++++--
 src/gromacs/domdec/gpuhaloexchange_impl.cpp |   9 +-
 src/gromacs/domdec/gpuhaloexchange_impl.cu  | 126 ++++++++++++++++++--
 src/gromacs/domdec/gpuhaloexchange_impl.cuh |  29 +++--
 src/gromacs/mdlib/sim_util.cpp              |  52 ++++++--
 src/gromacs/mdrun/runner.cpp                |   8 +-
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu        |  15 +++
 src/gromacs/nbnxm/nbnxm.cpp                 |  10 ++
 src/gromacs/nbnxm/nbnxm.h                   |   6 +
 src/gromacs/nbnxm/nbnxm_gpu.h               |  12 ++
 10 files changed, 258 insertions(+), 45 deletions(-)
 mode change 100755 => 100644 src/gromacs/domdec/gpuhaloexchange_impl.cpp

diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h
index 58f908b76a..4187e45cc2 100644
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -60,22 +60,31 @@ class GpuHaloExchange
     public:
         /*! \brief Creates GPU Halo Exchange object.
          *
-         * Halo exchange will be performed in \c streamNonLocal, and
-         * the main communicateHaloCoordinates method must be called
-         * before any subsequent operations that access non-local
-         * parts of the coordinate buffer (such as the non-local
-         * non-bonded kernels). It also must be called after the local
-         * coordinates buffer operations (where the coordinates are
-         * copied to the device and hence the \c
-         * coordinatesOnDeviceEvent is recorded).
+         * Coordinate halo exchange will be performed in \c
+         * streamNonLocal, and the \c communicateHaloCoordinates
+         * method must be called before any subsequent operations that
+         * access non-local parts of the coordinate buffer (such as
+         * the non-local non-bonded kernels). It also must be called
+         * after the local coordinates buffer operations (where the
+         * coordinates are copied to the device and hence the \c
+         * coordinatesOnDeviceEvent is recorded). Force halo exchange
+         * will be performed in \c streamNonLocal (also potentially
+         * with buffer clearing in \c streamLocal) and the \c
+         * communicateHaloForces method must be called after the
+         * non-local buffer operations, after the local force buffer
+         * has been copied to the GPU (if CPU forces are present), and
+         * before the local buffer operations. The force halo exchange
+         * does not yet support virial steps.
          *
         * \param [inout] dd                       domdec structure
        * \param [in]    mpi_comm_mysim           communicator used for simulation
+        * \param [in]    streamLocal              local NB CUDA stream.
        * \param [in]    streamNonLocal           non-local NB CUDA stream.
        * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
        */
        GpuHaloExchange(gmx_domdec_t *dd,
                        MPI_Comm      mpi_comm_mysim,
+                       void         *streamLocal,
                        void         *streamNonLocal,
                        void         *coordinatesOnDeviceEvent);
        ~GpuHaloExchange();
@@ -84,9 +93,10 @@ class GpuHaloExchange
         *
         * Initialization for GPU halo exchange of coordinates buffer
         * \param [in] d_coordinateBuffer   pointer to coordinates buffer in GPU memory
+        * \param [in] d_forcesBuffer       pointer to forces buffer in GPU memory
         */
-       void reinitHalo(rvec *d_coordinateBuffer);
-
+       void reinitHalo(rvec *d_coordinateBuffer,
+                       rvec *d_forcesBuffer);

        /*! \brief GPU halo exchange of coordinates buffer.
        *
@@ -97,6 +107,12 @@ class GpuHaloExchange
        */
        void communicateHaloCoordinates(const matrix box);

+       /*! \brief GPU halo exchange of force buffer.
+        * \param[in] accumulateForces  True if forces should accumulate, otherwise they are set
+        */
+       void communicateHaloForces(bool accumulateForces);
+
+
    private:
        class Impl;
        gmx::PrivateImplPointer<Impl> impl_;

diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
old mode 100755
new mode 100644
index 4b491eac36..741c79519c
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -62,6 +62,7 @@ class GpuHaloExchange::Impl
 /*!\brief Constructor stub. */
 GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
                                  MPI_Comm       /* mpi_comm_mysim */,
+                                 void         * /*streamLocal */,
                                  void         * /*streamNonLocal */,
                                  void         * /*coordinatesOnDeviceEvent*/)
    : impl_(nullptr)
@@ -72,7 +73,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
 GpuHaloExchange::~GpuHaloExchange() = default;

 /*!\brief init halo exhange stub. */
-void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */)
+void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */,
+                                 rvec * /* d_forcesBuffer */)
 {
    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
 }
@@ -83,6 +85,11 @@ void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */)
    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange exchange was called insted of the correct implementation.");
 }

+/*!\brief apply F halo exchange stub. */
+void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
+}

 } // namespace gmx

diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
index 4e8ba35dd4..0b024fb0bd 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -37,7 +37,7 @@
 * \brief Implements GPU halo exchange using CUDA.
 *
 *
- * \author Alan Gray
+ * \author Alan Gray
 *
 * \ingroup module_domdec
 */
@@ -95,10 +95,44 @@ __global__ void packSendBufKernel(float3 * __restrict__ dataPacked,
    return;
 }

-void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer)
+/*! \brief unpack non-local force data buffer on the GPU using pre-populated "map" containing index information
+ * \param[out] data        full array of force values
+ * \param[in]  dataPacked  packed array of force values to be transferred
+ * \param[in]  map         array of indices defining mapping from full to packed array
+ * \param[in]  mapSize     number of elements in map array
+ */
+template <bool accumulate>
+__global__ void unpackRecvBufKernel(float3 * __restrict__       data,
+                                    const float3 * __restrict__ dataPacked,
+                                    const int * __restrict__    map,
+                                    const int                   mapSize)
+{
+
+    int           threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
+    const float3 *gm_dataSrc  = &dataPacked[threadIndex];
+    float3       *gm_dataDest = &data[map[threadIndex]];
+
+    if (threadIndex < mapSize)
+    {
+        if (accumulate)
+        {
+            *gm_dataDest += *gm_dataSrc;
+        }
+        else
+        {
+            *gm_dataDest = *gm_dataSrc;
+        }
+    }
+
+    return;
+}
+
+void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer,
+                                       float3 *d_forcesBuffer)
 {

    d_x_ = d_coordinatesBuffer;
+    d_f_ = d_forcesBuffer;

    cudaStream_t stream = nonLocalStream_;
    int          nzone  = 1;
@@ -128,7 +162,7 @@ void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer)
    fSendSize_ = xRecvSize_;
    fRecvSize_ = xSendSize_;

-    localOffset_  = comm.atomRanges.numHomeAtoms();  //offset for data recieved by this rank
+    numHomeAtoms_ = comm.atomRanges.numHomeAtoms();  //offset for data received by this rank

    GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch");
    std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
@@ -141,7 +175,7 @@ void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer)
    // since the pointers will not change until the next NS step.

    //Coordinates buffer:
-    void* recvPtr = static_cast<void*> (&d_coordinatesBuffer[localOffset_]);
+    void* recvPtr = static_cast<void*> (&d_coordinatesBuffer[numHomeAtoms_]);
    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0,
                 &remoteXPtr_, sizeof(void*), MPI_BYTE, sendRankX_, 0,
                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
@@ -208,6 +242,58 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box)
    return;
 }

+// The following method should be called after non-local buffer operations,
+// and before the local buffer operations. It operates in the non-local stream.
+void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
+{
+
+    // Communicate halo data (in non-local stream)
+    communicateHaloData(d_f_, HaloQuantity::HaloForces);
+
+    float3 *d_f = d_f_;
+
+    if (!accumulateForces)
+    {
+        //Clear local portion of force array (in local stream)
+        cudaMemsetAsync(d_f, 0, numHomeAtoms_*sizeof(rvec), localStream_);
+    }
+
+    // ensure non-local stream waits for local stream, due to dependence on
+    // the previous H2D copy of CPU forces (if accumulateForces is true)
+    // or the above clearing.
+    // TODO remove this dependency on localStream - Redmine issue #3093
+    GpuEventSynchronizer eventLocal;
+    eventLocal.markEvent(localStream_);
+    eventLocal.enqueueWaitEvent(nonLocalStream_);
+
+    //Unpack halo buffer into force array
+
+    KernelLaunchConfig config;
+    config.blockSize[0]     = c_threadsPerBlock;
+    config.blockSize[1]     = 1;
+    config.blockSize[2]     = 1;
+    config.gridSize[0]      = (fRecvSize_+c_threadsPerBlock-1)/c_threadsPerBlock;
+    config.gridSize[1]      = 1;
+    config.gridSize[2]      = 1;
+    config.sharedMemorySize = 0;
+    config.stream           = nonLocalStream_;
+
+    const float3 *recvBuf  = d_recvBuf_;
+    const int    *indexMap = d_indexMap_;
+    const int     size     = fRecvSize_;
+
+    if (size > 0)
+    {
+        auto kernelFn = accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>;
+
+        const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_f,
+                                                          &recvBuf, &indexMap,
+                                                          &size);
+
+        launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
+    }
+}
+
 void GpuHaloExchange::Impl::communicateHaloData(float3      * d_ptr,
                                                 HaloQuantity  haloQuantity)
@@ -218,6 +304,7 @@ void GpuHaloExchange::Impl::communicateHaloData(float3      * d_ptr,
    void * remotePtr;
    int    sendRank;
    int    recvRank;
+
    if (haloQuantity == HaloQuantity::HaloCoordinates)
    {
        sendPtr   = static_cast<void*> (d_sendBuf_);
        sendSize  = xSendSize_;
        remotePtr = remoteXPtr_;
        sendRank  = sendRankX_;
        recvRank  = recvRankX_;
+
+        //Wait for signal from receiving task that it is ready, and similarly send signal to task that will push data to this task
+        char thisTaskIsReady, remoteTaskIsReady;
+        MPI_Sendrecv(&thisTaskIsReady, sizeof(char), MPI_BYTE, recvRank, 0,
+                     &remoteTaskIsReady, sizeof(char), MPI_BYTE, sendRank, 0,
+                     mpi_comm_mysim_, MPI_STATUS_IGNORE);
    }
    else
    {
-        sendPtr   = static_cast<void*> (&(d_ptr[localOffset_]));
+        sendPtr   = static_cast<void*> (&(d_ptr[numHomeAtoms_]));
        sendSize  = fSendSize_;
        remotePtr = remoteFPtr_;
        sendRank  = sendRankF_;
@@ -238,7 +331,6 @@ void GpuHaloExchange::Impl::communicateHaloData(float3 * d_ptr,
    communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank);
 }

-
 void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
                                                               int   sendSize,
                                                               int   sendRank,
@@ -247,6 +339,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
 {

    cudaError_t  stat;
+    cudaStream_t stream = nonLocalStream_;

    // We asynchronously push data to remote rank. The remote
    // destination pointer has already been set in the init fn. We
@@ -257,7 +350,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
    // send data to neighbor, if any data exists to send
    if (sendSize > 0)
    {
-        stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, nonLocalStream_);
+        stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, stream);
        CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
    }

@@ -267,19 +360,20 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
    // its stream.
    GpuEventSynchronizer *haloDataTransferRemote;

-    haloDataTransferLaunched_->markEvent(nonLocalStream_);
+    haloDataTransferLaunched_->markEvent(stream);

    MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0,
                 &haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0,
                 mpi_comm_mysim_, MPI_STATUS_IGNORE);

-    haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
+    haloDataTransferRemote->enqueueWaitEvent(stream);
 }

 /*! \brief Create Domdec GPU object */
 GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
                             MPI_Comm      mpi_comm_mysim,
+                            void         *localStream,
                             void         *nonLocalStream,
                             void         *coordinatesOnDeviceEvent)
    : dd_(dd),
@@ -290,6 +384,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
      usePBC_(dd->ci[dd->dim[0]] == 0),
      haloDataTransferLaunched_(new GpuEventSynchronizer()),
      mpi_comm_mysim_(mpi_comm_mysim),
+      localStream_(*static_cast<cudaStream_t*> (localStream)),
      nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream)),
      coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
 {
@@ -323,17 +418,19 @@ GpuHaloExchange::Impl::~Impl()

 GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
                                  MPI_Comm      mpi_comm_mysim,
+                                 void         *localStream,
                                  void         *nonLocalStream,
                                  void         *coordinatesOnDeviceEvent)
-    : impl_(new Impl(dd, mpi_comm_mysim, nonLocalStream, coordinatesOnDeviceEvent))
+    : impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, coordinatesOnDeviceEvent))
 {
 }

 GpuHaloExchange::~GpuHaloExchange() = default;

-void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer)
+void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer,
+                                 rvec *d_forcesBuffer)
 {
-    impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer));
+    impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer), reinterpret_cast<float3*>(d_forcesBuffer));
 }

 void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
@@ -341,4 +438,9 @@ void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
    impl_->communicateHaloCoordinates(box);
 }

+void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
+{
+    impl_->communicateHaloForces(accumulateForces);
+}
+
 } //namespace gmx

diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
index bd125654ee..10d9118927 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -68,11 +68,13 @@ class GpuHaloExchange::Impl
         *
         * \param [inout] dd                       domdec structure
         * \param [in]    mpi_comm_mysim           communicator used for simulation
+        * \param [in]    localStream              local NB CUDA stream
        * \param [in]    nonLocalStream           non-local NB CUDA stream
        * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
        */
        Impl(gmx_domdec_t *dd,
             MPI_Comm      mpi_comm_mysim,
+            void         *localStream,
             void         *nonLocalStream,
             void         *coordinatesOnDeviceEvent);
        ~Impl();
@@ -80,8 +82,10 @@ class GpuHaloExchange::Impl
        /*! \brief
        * (Re-) Initialization for GPU halo exchange
        * \param [in] d_coordinatesBuffer  pointer to coordinates buffer in GPU memory
+        * \param [in] d_forcesBuffer       pointer to forces buffer in GPU memory
        */
-       void reinitHalo(float3 *d_coordinatesBuffer);
+       void reinitHalo(float3 *d_coordinatesBuffer,
+                       float3 *d_forcesBuffer);


        /*! \brief
@@ -90,6 +94,11 @@ class GpuHaloExchange::Impl
        */
        void communicateHaloCoordinates(const matrix box);

+       /*! \brief GPU halo exchange of force buffer
+        * \param[in] accumulateForces  True if forces should accumulate, otherwise they are set
+        */
+       void communicateHaloForces(bool accumulateForces);
+
    private:

        /*! \brief Data transfer wrapper for GPU halo exchange
@@ -106,11 +115,11 @@ class GpuHaloExchange::Impl
        * \param [inout] remotePtr  remote address to recv data
        * \param [in]    recvRank   rank to recv data from
        */
-       void communicateHaloDataWithCudaDirect(void *sendPtr,
-                                              int   sendSize,
-                                              int   sendRank,
-                                              void* remotePtr,
-                                              int   recvRank);
+       void communicateHaloDataWithCudaDirect(void  *sendPtr,
+                                              int    sendSize,
+                                              int    sendRank,
+                                              void  *remotePtr,
+                                              int    recvRank);

        //! Domain decomposition object
        gmx_domdec_t               *dd_ = nullptr;
@@ -152,8 +161,8 @@ class GpuHaloExchange::Impl
        int                         fSendSize_ = 0;
        //! recv copy size to this rank for F
        int                         fRecvSize_ = 0;
-       //! offset of local halo region
-       int                         localOffset_ = 0;
+       //! number of home atoms - offset of local halo region
+       int                         numHomeAtoms_ = 0;
        //! remote GPU coordinates buffer pointer for pushing data
        void                       *remoteXPtr_ = 0;
        //! remote GPU force buffer pointer for pushing data
@@ -166,12 +175,16 @@ class GpuHaloExchange::Impl
        GpuEventSynchronizer       *haloDataTransferLaunched_ = nullptr;
        //! MPI communicator used for simulation
        MPI_Comm                    mpi_comm_mysim_;
+       //! CUDA stream for local non-bonded calculations
+       cudaStream_t                localStream_ = nullptr;
        //! CUDA stream for non-local non-bonded calculations
        cudaStream_t                nonLocalStream_ = nullptr;
        //! Event triggered when coordinates have been copied to device
        GpuEventSynchronizer       *coordinatesOnDeviceEvent_ = nullptr;
        //! full coordinates buffer in GPU memory
        float3                     *d_x_ = nullptr;
+       //! full forces buffer in GPU memory
+       float3                     *d_f_ = nullptr;

 };

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index aba304222f..881b9974ac 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1210,7 +1210,8 @@ void do_force(FILE *fplog,
            if (ddUsesGpuDirectCommunication)
            {
                rvec* d_x = static_cast<rvec*> (nbv->get_gpu_xrvec());
-                gpuHaloExchange->reinitHalo(d_x);
+                rvec* d_f = static_cast<rvec*> (nbv->get_gpu_frvec());
+                gpuHaloExchange->reinitHalo(d_x, d_f);
            }
        }
        else
@@ -1519,6 +1520,12 @@ void do_force(FILE *fplog,
        }
    }

+    const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
+    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+    // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
+    const bool haveCpuLocalForces       = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuPmeFReduction ||
+                                           (fr->efep != efepNO));
+
    if (havePPDomainDecomposition(cr))
    {
        /* We are done with the CPU compute.
@@ -1530,11 +1537,27 @@
        if (forceFlags.computeForces)
        {
-            if (useGpuFBufOps == BufferOpsUseGpu::True)
+            gmx::ArrayRef<gmx::RVec> force = forceOut.forceWithShiftForces().force();
+            rvec                    *f     = as_rvec_array(force.data());
+
+            if (useGpuForcesHaloExchange)
            {
-                nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
+                if (haveCpuLocalForces)
+                {
+                    nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
+                }
+                bool accumulateHaloForces = haveCpuLocalForces;
+                gpuHaloExchange->communicateHaloForces(accumulateHaloForces);
            }
-            dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
+            else
+            {
+                if (useGpuFBufOps == BufferOpsUseGpu::True)
+                {
+                    nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
+                }
+                dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
+            }
+
        }
    }

@@ -1604,12 +1627,6 @@
    {
        gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();

-
-        const bool useCpuPmeFReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
-        // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
-        const bool haveCpuLocalForces  = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuPmeFReduction ||
-                                          (fr->efep != efepNO));
-
        if (useGpuFBufOps == BufferOpsUseGpu::True)
        {
            // Flag to specify whether the CPU force buffer has contributions to
            // - CPU f H2D should be as soon as all CPU-side forces are done
            // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
            //   before the next CPU task that consumes the forces: vsite spread or update)
-            //
+            // - copy is not performed if GPU force halo exchange is active, because it would overwrite the result
+            //   of the halo exchange. In that case the copy is instead performed above, before the exchange.
+            //   These should be unified.
            rvec *f = as_rvec_array(forceWithShift.data());
-            if (haveLocalForceContribInCpuBuffer)
+            if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
            {
                nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
            }
+            if (useGpuForcesHaloExchange)
+            {
+                // Add a stream synchronization to satisfy a dependency
+                // for the local buffer ops on the result of GPU halo
+                // exchange, which operates in the non-local stream and
+                // writes to the local part of the force buffer.
+                // TODO improve this through use of an event - see Redmine #3093
+                nbv->stream_local_wait_for_nonlocal();
+            }
            nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
                                              nbv->getDeviceForces(),
                                              pme_gpu_get_device_f(fr->pmedata),

diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index acca236d2b..8d2f549c9d 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1337,11 +1337,15 @@ int Mdrunner::mdrunner()

    // TODO Move this to happen during domain decomposition setup,
    // once stream and event handling works well with that.
+    // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
    if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded)
    {
-        void *stream                   = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+        void *streamLocal              = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local);
+        void *streamNonLocal           =
+            Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
        void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
-        cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, stream, coordinatesOnDeviceEvent);
+        cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, streamLocal,
+                                                                    streamNonLocal, coordinatesOnDeviceEvent);
    }

    /* Initialize the mdAtoms structure.

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 9715d2985f..05c0278abc 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -1074,6 +1074,11 @@ void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t *gpu_nbv)
    return static_cast<void*> (gpu_nbv->xrvec);
 }

+void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t *gpu_nbv)
+{
+    return static_cast<void*> (gpu_nbv->frvec);
+}
+
 void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb)
 {
    return static_cast<void*> (nb->xAvailableOnDevice);
@@ -1084,4 +1089,14 @@ void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_cuda_t *nb)
    nb->xNonLocalCopyD2HDone->waitForEvent();
 }

+void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_cuda_t *nb)
+{
+    cudaStream_t localStream    = nb->stream[InteractionLocality::Local];
+    cudaStream_t nonLocalStream = nb->stream[InteractionLocality::NonLocal];
+
+    GpuEventSynchronizer event;
+    event.markEvent(nonLocalStream);
+    event.enqueueWaitEvent(localStream);
+}
+
 } // namespace Nbnxm

diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 031033dea9..892098ae4a 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -330,4 +330,14 @@ void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done()
    Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
 }

+void* nonbonded_verlet_t::get_gpu_frvec()
+{
+    return Nbnxm::nbnxn_get_gpu_frvec(gpu_nbv);
+}
+
+void nonbonded_verlet_t::stream_local_wait_for_nonlocal()
+{
+    Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv);
+}
+
 /*! \endcond */

diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 7a27a5cb4e..caa21d9c7f 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -406,6 +406,12 @@ struct nonbonded_verlet_t
        /*! \brief Wait for non-local copy of coordinate buffer from device to host */
        void wait_nonlocal_x_copy_D2H_done();

+        /*! \brief return GPU pointer to f in rvec format */
+        void* get_gpu_frvec();
+
+        /*! \brief Ensure local stream waits for non-local stream */
+        void stream_local_wait_for_nonlocal();
+
        //! Return the kernel setup
        const Nbnxm::KernelSetup &kernelSetup() const
        {

diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 743bcffa20..635b9d9790 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -411,5 +411,17 @@ void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
 CUDA_FUNC_QUALIFIER
 void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;

+/*! \brief return GPU pointer to f in rvec format
+ * \param[in] nb  The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Ensure local stream waits for non-local stream
+ * \param[in] nb  The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;
+
 } // namespace Nbnxm
 #endif
-- 
2.22.0
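
Note on the stream ordering used above: the correctness of communicateHaloForces() and of the local buffer ops that follow it rests on cross-stream dependencies. The non-local stream must wait for the local stream (which carries the force clearing or the H2D copy of CPU forces) before the unpack kernel runs, and the local stream must in turn wait for the non-local stream before atomdata_add_nbat_f_to_f_gpu reduces into the local part of the force buffer (nbnxn_stream_local_wait_for_nonlocal). The standalone sketch below is not GROMACS code; it uses plain CUDA events in place of GpuEventSynchronizer, and all names in it are illustrative, but it shows the same producer/consumer pattern between two streams that the patch relies on (and that #3093 proposes to make more explicit).

#include <cuda_runtime.h>
#include <cstdio>

#define CHECK(call)                                                         \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                         cudaGetErrorString(err_), __FILE__, __LINE__);     \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Stand-in for the halo unpack work that must not start before the buffer
// has been cleared (or the CPU forces copied in) by the other stream.
__global__ void addOne(float* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        data[i] += 1.0f;
    }
}

int main()
{
    const int n     = 1 << 20;
    float*    d_buf = nullptr;
    CHECK(cudaMalloc(&d_buf, n * sizeof(float)));

    cudaStream_t localStream, nonLocalStream;
    cudaEvent_t  localWorkDone;
    CHECK(cudaStreamCreate(&localStream));
    CHECK(cudaStreamCreate(&nonLocalStream));
    CHECK(cudaEventCreateWithFlags(&localWorkDone, cudaEventDisableTiming));

    // "Local stream" work: clear the buffer (analogous to clearing the local
    // part of the force buffer when not accumulating).
    CHECK(cudaMemsetAsync(d_buf, 0, n * sizeof(float), localStream));

    // Record an event in the local stream and make the non-local stream wait
    // on it, mirroring markEvent()/enqueueWaitEvent() in the patch.
    CHECK(cudaEventRecord(localWorkDone, localStream));
    CHECK(cudaStreamWaitEvent(nonLocalStream, localWorkDone, 0));

    // "Non-local stream" work: only now is it safe to touch the buffer.
    addOne<<<(n + 255) / 256, 256, 0, nonLocalStream>>>(d_buf, n);

    CHECK(cudaStreamSynchronize(nonLocalStream));
    std::printf("done\n");

    CHECK(cudaEventDestroy(localWorkDone));
    CHECK(cudaStreamDestroy(localStream));
    CHECK(cudaStreamDestroy(nonLocalStream));
    CHECK(cudaFree(d_buf));
    return 0;
}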
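
Note on the unpack kernel: unpackRecvBufKernel is templated on a bool so that one kernel body can either accumulate the received halo forces into the home-atom range or overwrite it, with the instantiation picked at launch time (accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>). The standalone sketch below is not GROMACS code - the kernel name, index map and sizes are hypothetical - but it demonstrates the same scatter-through-a-map with compile-time selection of accumulate-versus-set semantics.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

template<bool accumulate>
__global__ void unpackKernel(float3* data, const float3* dataPacked, const int* map, int mapSize)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < mapSize)
    {
        float3  src  = dataPacked[i];
        float3* dest = &data[map[i]];
        if (accumulate)
        {
            // add received halo contribution to the existing force
            dest->x += src.x;
            dest->y += src.y;
            dest->z += src.z;
        }
        else
        {
            // no prior contributions: overwrite
            *dest = src;
        }
    }
}

int main()
{
    const int           mapSize = 4, numAtoms = 8;
    std::vector<int>    h_map = { 1, 3, 5, 7 };  // hypothetical halo-to-home index map
    std::vector<float3> h_packed(mapSize, make_float3(1, 2, 3));

    float3* d_data   = nullptr;
    float3* d_packed = nullptr;
    int*    d_map    = nullptr;
    cudaMalloc(&d_data, numAtoms * sizeof(float3));
    cudaMalloc(&d_packed, mapSize * sizeof(float3));
    cudaMalloc(&d_map, mapSize * sizeof(int));
    cudaMemset(d_data, 0, numAtoms * sizeof(float3));
    cudaMemcpy(d_packed, h_packed.data(), mapSize * sizeof(float3), cudaMemcpyHostToDevice);
    cudaMemcpy(d_map, h_map.data(), mapSize * sizeof(int), cudaMemcpyHostToDevice);

    // Accumulate when forces are already present in the buffer (e.g. CPU
    // forces were copied in); otherwise set. Template chosen at launch time.
    bool accumulateForces = true;
    auto kernel = accumulateForces ? unpackKernel<true> : unpackKernel<false>;
    kernel<<<(mapSize + 63) / 64, 64>>>(d_data, d_packed, d_map, mapSize);
    cudaDeviceSynchronize();

    std::vector<float3> h_data(numAtoms);
    cudaMemcpy(h_data.data(), d_data, numAtoms * sizeof(float3), cudaMemcpyDeviceToHost);
    std::printf("atom 3 force: %g %g %g\n", h_data[3].x, h_data[3].y, h_data[3].z);

    cudaFree(d_data);
    cudaFree(d_packed);
    cudaFree(d_map);
    return 0;
}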