Activate with GMX_GPU_DD_COMMS environment variable.
Extends GPU Halo exchange feature to provide GPU Force halo exchange
functionality. Does not yet support virial steps, which require an
extra shift force reduction - these are currently performed on the
non-buffer-ops / non-direct-comm path. It also has the same
limitations as the coordinate halo exchange.
Performs part of #2890. Future work to improve synchronization towards
a more one-sided scheme (#3092) and to make dependencies more
explicit (#3093).
Change-Id: Ifc23cc8db2655f7258e68b34e7cdc7b71994e1e8
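For context, a minimal sketch of how the activation constant used later in this change might be derived from the environment variable; the exact build-configuration conditions belong to the runner setup and are an assumption here, only the variable name and c_enableGpuHaloExchange come from this change.

// Hedged sketch (not part of this change): derive the gate for the GPU
// direct-communication halo exchange from the GMX_GPU_DD_COMMS variable.
#include <cstdlib>
static const bool c_enableGpuHaloExchange =
    (std::getenv("GMX_GPU_DD_COMMS") != nullptr); // real code also checks the CUDA/thread-MPI build configuration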
public:
/*! \brief Creates GPU Halo Exchange object.
*
- * Halo exchange will be performed in \c streamNonLocal, and
- * the main communicateHaloCoordinates method must be called
- * before any subsequent operations that access non-local
- * parts of the coordinate buffer (such as the non-local
- * non-bonded kernels). It also must be called after the local
- * coordinates buffer operations (where the coordinates are
- * copied to the device and hence the \c
- * coordinatesOnDeviceEvent is recorded).
+ * Coordinate halo exchange will be performed in \c
+ * streamNonLocal, and the \c communicateHaloCoordinates
+ * method must be called before any subsequent operations that
+ * access non-local parts of the coordinate buffer (such as
+ * the non-local non-bonded kernels). It also must be called
+ * after the local coordinates buffer operations (where the
+ * coordinates are copied to the device and hence the \c
+ * coordinatesOnDeviceEvent is recorded). Force Halo exchange
+ * will be performed in \c streamNonLocal (also potentially
+ * with buffer clearing in \c streamLocal) and the \c
+ * communicateHaloForces method must be called after the
+ * non-local buffer operations, after the local force buffer
+ * has been copied to the GPU (if CPU forces are present), and
+ * before the local buffer operations. The force halo exchange
+ * does not yet support virial steps.
*
* \param [inout] dd domdec structure
* \param [in] mpi_comm_mysim communicator used for simulation
+ * \param [in] streamLocal local NB CUDA stream.
* \param [in] streamNonLocal non-local NB CUDA stream.
* \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
*/
GpuHaloExchange(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
+ void *streamLocal,
void *streamNonLocal,
void *coordinatesOnDeviceEvent);
~GpuHaloExchange();
*
* Initialization for GPU halo exchange of coordinates and forces buffers
* \param [in] d_coordinateBuffer pointer to coordinates buffer in GPU memory
+ * \param [in] d_forcesBuffer  pointer to forces buffer in GPU memory
*/
- void reinitHalo(rvec *d_coordinateBuffer);
-
+ void reinitHalo(rvec *d_coordinateBuffer,
+ rvec *d_forcesBuffer);
/*! \brief GPU halo exchange of coordinates buffer.
*
*/
void communicateHaloCoordinates(const matrix box);
+ /*! \brief GPU halo exchange of force buffer.
+ * \param[in] accumulateForces True if forces should accumulate, otherwise they are set
+ */
+ void communicateHaloForces(bool accumulateForces);
+
+
private:
class Impl;
gmx::PrivateImplPointer<Impl> impl_;
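To make the call ordering documented above concrete, here is a hedged usage sketch; the caller context, the device buffer names and the haveCpuLocalForces flag are illustrative, not part of this change.

// Hedged usage sketch of the public API and its required ordering.
gmx::GpuHaloExchange haloExchange(dd, mpi_comm_mysim,
                                  streamLocal, streamNonLocal,
                                  coordinatesOnDeviceEvent);
// After each domain (re)partitioning, once the device buffers are valid:
haloExchange.reinitHalo(d_coordinates, d_forces);

// Each MD step:
// 1. local coordinate buffer ops copy x to the device and record
//    coordinatesOnDeviceEvent, then:
haloExchange.communicateHaloCoordinates(box);   // before non-local non-bonded kernels
// 2. ... non-local kernels and non-local force buffer ops run ...
// 3. if CPU force contributions exist, copy the local CPU forces to the
//    GPU first and accumulate onto them; otherwise pass false:
haloExchange.communicateHaloForces(haveCpuLocalForces); // before local force buffer ops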
/*!\brief Constructor stub. */
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
MPI_Comm /* mpi_comm_mysim */,
+ void * /*streamLocal */,
void * /*streamNonLocal */,
void * /*coordinatesOnDeviceEvent*/)
: impl_(nullptr)
GpuHaloExchange::~GpuHaloExchange() = default;
/*!\brief init halo exchange stub. */
-void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */)
+void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */,
+ rvec * /* d_forcesBuffer */)
{
GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
}
GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
}
+/*!\brief apply F halo exchange stub. */
+void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
+}
} // namespace gmx
* \brief Implements GPU halo exchange using CUDA.
*
*
- * \author Alan Gray <alang@nvidia.com.com>
+ * \author Alan Gray <alang@nvidia.com>
*
* \ingroup module_domdec
*/
return;
}
-void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer)
+/*! \brief unpack non-local force data buffer on the GPU using pre-populated "map" containing index information
+ * \param[out] data full array of force values
+ * \param[in] dataPacked packed array of force values to be transferred
+ * \param[in] map array of indices defining mapping from full to packed array
+ * \param[in] mapSize number of elements in map array
+ */
+template <bool accumulate>
+__global__ void unpackRecvBufKernel(float3 * __restrict__ data,
+ const float3 * __restrict__ dataPacked,
+ const int * __restrict__ map,
+ const int mapSize)
+{
+
+    int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
+
+    if (threadIndex < mapSize)
+    {
+        // only in-range threads read the map and unpack, avoiding an
+        // out-of-bounds read of the index map
+        const float3 *gm_dataSrc  = &dataPacked[threadIndex];
+        float3       *gm_dataDest = &data[map[threadIndex]];
+
+        if (accumulate)
+        {
+            *gm_dataDest += *gm_dataSrc;
+        }
+        else
+        {
+            *gm_dataDest = *gm_dataSrc;
+        }
+    }
+
+    return;
+}
+
+void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer,
+ float3 *d_forcesBuffer)
{
d_x_ = d_coordinatesBuffer;
+ d_f_ = d_forcesBuffer;
cudaStream_t stream = nonLocalStream_;
int nzone = 1;
fSendSize_ = xRecvSize_;
fRecvSize_ = xSendSize_;
- localOffset_ = comm.atomRanges.numHomeAtoms(); //offset for data recieved by this rank
+    numHomeAtoms_ = comm.atomRanges.numHomeAtoms(); //offset for data received by this rank
GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch");
std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
// since the pointers will not change until the next NS step.
//Coordinates buffer:
- void* recvPtr = static_cast<void*> (&d_coordinatesBuffer[localOffset_]);
+ void* recvPtr = static_cast<void*> (&d_coordinatesBuffer[numHomeAtoms_]);
MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0,
&remoteXPtr_, sizeof(void*), MPI_BYTE, sendRankX_, 0,
mpi_comm_mysim_, MPI_STATUS_IGNORE);
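    // Hedged sketch of the analogous exchange for the force path (elided
    // from this excerpt): the remote rank pushes force halo data into this
    // rank's packed receive buffer, so its address is exchanged here;
    // recvRankF_ is an assumed member mirroring recvRankX_.
    //Force buffer:
    recvPtr = static_cast<void*> (d_recvBuf_);
    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankF_, 0,
                 &remoteFPtr_, sizeof(void*), MPI_BYTE, sendRankF_, 0,
                 mpi_comm_mysim_, MPI_STATUS_IGNORE);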
return;
}
+// The following method should be called after non-local buffer operations,
+// and before the local buffer operations. It operates in the non-local stream.
+void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
+{
+
+ // Communicate halo data (in non-local stream)
+ communicateHaloData(d_f_, HaloQuantity::HaloForces);
+
+ float3 *d_f = d_f_;
+
+ if (!accumulateForces)
+ {
+ //Clear local portion of force array (in local stream)
+ cudaMemsetAsync(d_f, 0, numHomeAtoms_*sizeof(rvec), localStream_);
+ }
+
+ // ensure non-local stream waits for local stream, due to dependence on
+ // the previous H2D copy of CPU forces (if accumulateForces is true)
+ // or the above clearing.
+    // TODO remove this dependency on localStream - Redmine issue #3093
+ GpuEventSynchronizer eventLocal;
+ eventLocal.markEvent(localStream_);
+ eventLocal.enqueueWaitEvent(nonLocalStream_);
+
+ //Unpack halo buffer into force array
+
+ KernelLaunchConfig config;
+ config.blockSize[0] = c_threadsPerBlock;
+ config.blockSize[1] = 1;
+ config.blockSize[2] = 1;
+ config.gridSize[0] = (fRecvSize_+c_threadsPerBlock-1)/c_threadsPerBlock;
+ config.gridSize[1] = 1;
+ config.gridSize[2] = 1;
+ config.sharedMemorySize = 0;
+ config.stream = nonLocalStream_;
+
+ const float3 *recvBuf = d_recvBuf_;
+ const int *indexMap = d_indexMap_;
+ const int size = fRecvSize_;
+
+ if (size > 0)
+ {
+ auto kernelFn = accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>;
+
+ const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_f,
+ &recvBuf, &indexMap,
+ &size);
+
+ launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
+ }
+}
+
void GpuHaloExchange::Impl::communicateHaloData(float3 * d_ptr,
HaloQuantity haloQuantity)
void * remotePtr;
int sendRank;
int recvRank;
+
if (haloQuantity == HaloQuantity::HaloCoordinates)
{
sendPtr = static_cast<void*> (d_sendBuf_);
remotePtr = remoteXPtr_;
sendRank = sendRankX_;
recvRank = recvRankX_;
+
+ //Wait for signal from receiving task that it is ready, and similarly send signal to task that will push data to this task
+ char thisTaskIsReady, remoteTaskIsReady;
+ MPI_Sendrecv(&thisTaskIsReady, sizeof(char), MPI_BYTE, recvRank, 0,
+ &remoteTaskIsReady, sizeof(char), MPI_BYTE, sendRank, 0,
+ mpi_comm_mysim_, MPI_STATUS_IGNORE);
}
else
{
- sendPtr = static_cast<void*> (&(d_ptr[localOffset_]));
+ sendPtr = static_cast<void*> (&(d_ptr[numHomeAtoms_]));
sendSize = fSendSize_;
remotePtr = remoteFPtr_;
sendRank = sendRankF_;
communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank);
}
-
void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
int sendSize,
int sendRank,
{
cudaError_t stat;
+ cudaStream_t stream = nonLocalStream_;
// We asynchronously push data to remote rank. The remote
// destination pointer has already been set in the init fn. We
// send data to neighbor, if any data exists to send
if (sendSize > 0)
{
- stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, nonLocalStream_);
+ stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, stream);
CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
}
// its stream.
GpuEventSynchronizer *haloDataTransferRemote;
- haloDataTransferLaunched_->markEvent(nonLocalStream_);
+ haloDataTransferLaunched_->markEvent(stream);
MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0,
&haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0,
mpi_comm_mysim_, MPI_STATUS_IGNORE);
- haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
+ haloDataTransferRemote->enqueueWaitEvent(stream);
}
/*! \brief Create Domdec GPU object */
GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
+ void * localStream,
void * nonLocalStream,
void * coordinatesOnDeviceEvent)
: dd_(dd),
usePBC_(dd->ci[dd->dim[0]] == 0),
haloDataTransferLaunched_(new GpuEventSynchronizer()),
mpi_comm_mysim_(mpi_comm_mysim),
+ localStream_(*static_cast<cudaStream_t*> (localStream)),
nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream)),
coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
{
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
+ void *localStream,
void *nonLocalStream,
void *coordinatesOnDeviceEvent)
- : impl_(new Impl(dd, mpi_comm_mysim, nonLocalStream, coordinatesOnDeviceEvent))
+ : impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, coordinatesOnDeviceEvent))
{
}
GpuHaloExchange::~GpuHaloExchange() = default;
-void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer)
+void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer,
+ rvec *d_forcesBuffer)
{
- impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer));
+ impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer), reinterpret_cast<float3*>(d_forcesBuffer));
}
void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
impl_->communicateHaloCoordinates(box);
}
+void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
+{
+ impl_->communicateHaloForces(accumulateForces);
+}
+
} //namespace gmx
*
* \param [inout] dd domdec structure
* \param [in] mpi_comm_mysim communicator used for simulation
+ * \param [in] localStream local NB CUDA stream
* \param [in] nonLocalStream non-local NB CUDA stream
* \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
*/
Impl(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
+ void *localStream,
void *nonLocalStream,
void *coordinatesOnDeviceEvent);
~Impl();
/*! \brief
* (Re-) Initialization for GPU halo exchange
* \param [in] d_coordinatesBuffer pointer to coordinates buffer in GPU memory
+ * \param [in] d_forcesBuffer pointer to forces buffer in GPU memory
*/
- void reinitHalo(float3 *d_coordinatesBuffer);
+ void reinitHalo(float3 *d_coordinatesBuffer,
+ float3 *d_forcesBuffer);
/*! \brief
*/
void communicateHaloCoordinates(const matrix box);
+ /*! \brief GPU halo exchange of force buffer
+ * \param[in] accumulateForces True if forces should accumulate, otherwise they are set
+ */
+ void communicateHaloForces(bool accumulateForces);
+
private:
/*! \brief Data transfer wrapper for GPU halo exchange
* \param [inout] remotePtr remote address to recv data
* \param [in] recvRank rank to recv data from
*/
- void communicateHaloDataWithCudaDirect(void *sendPtr,
- int sendSize,
- int sendRank,
- void* remotePtr,
- int recvRank);
+ void communicateHaloDataWithCudaDirect(void *sendPtr,
+ int sendSize,
+ int sendRank,
+ void * remotePtr,
+ int recvRank);
//! Domain decomposition object
gmx_domdec_t *dd_ = nullptr;
int fSendSize_ = 0;
//! recv copy size to this rank for F
int fRecvSize_ = 0;
- //! offset of local halo region
- int localOffset_ = 0;
+ //! number of home atoms - offset of local halo region
+ int numHomeAtoms_ = 0;
//! remote GPU coordinates buffer pointer for pushing data
void *remoteXPtr_ = 0;
//! remote GPU force buffer pointer for pushing data
GpuEventSynchronizer *haloDataTransferLaunched_ = nullptr;
//! MPI communicator used for simulation
MPI_Comm mpi_comm_mysim_;
+ //! CUDA stream for local non-bonded calculations
+ cudaStream_t localStream_ = nullptr;
//! CUDA stream for non-local non-bonded calculations
cudaStream_t nonLocalStream_ = nullptr;
//! Event triggered when coordinates have been copied to device
GpuEventSynchronizer *coordinatesOnDeviceEvent_ = nullptr;
//! full coordinates buffer in GPU memory
float3 *d_x_ = nullptr;
+ //! full forces buffer in GPU memory
+ float3 *d_f_ = nullptr;
};
if (ddUsesGpuDirectCommunication)
{
rvec* d_x = static_cast<rvec *> (nbv->get_gpu_xrvec());
- gpuHaloExchange->reinitHalo(d_x);
+ rvec* d_f = static_cast<rvec *> (nbv->get_gpu_frvec());
+ gpuHaloExchange->reinitHalo(d_x, d_f);
}
}
else
}
}
+ const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
+ const bool useCpuPmeFReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+ // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
+ const bool haveCpuLocalForces = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuPmeFReduction ||
+ (fr->efep != efepNO));
+
if (havePPDomainDecomposition(cr))
{
/* We are done with the CPU compute.
if (forceFlags.computeForces)
{
- if (useGpuFBufOps == BufferOpsUseGpu::True)
+ gmx::ArrayRef<gmx::RVec> force = forceOut.forceWithShiftForces().force();
+ rvec *f = as_rvec_array(force.data());
+
+ if (useGpuForcesHaloExchange)
{
- nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
+ if (haveCpuLocalForces)
+ {
+ nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
+ }
+ bool accumulateHaloForces = haveCpuLocalForces;
+ gpuHaloExchange->communicateHaloForces(accumulateHaloForces);
}
- dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
+ else
+ {
+ if (useGpuFBufOps == BufferOpsUseGpu::True)
+ {
+ nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
+ }
+ dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
+ }
+
}
}
{
gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();
-
- const bool useCpuPmeFReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
- // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
- const bool haveCpuLocalForces = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuPmeFReduction ||
- (fr->efep != efepNO));
-
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
// Flag to specify whether the CPU force buffer has contributions to
// - CPU f H2D should be as soon as all CPU-side forces are done
// - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
// before the next CPU task that consumes the forces: vsite spread or update)
- //
+        //   - copy is not performed if GPU force halo exchange is active, because it would overwrite the result
+ // of the halo exchange. In that case the copy is instead performed above, before the exchange.
+ // These should be unified.
rvec *f = as_rvec_array(forceWithShift.data());
- if (haveLocalForceContribInCpuBuffer)
+ if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
{
nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
}
+ if (useGpuForcesHaloExchange)
+ {
+ // Add a stream synchronization to satisfy a dependency
+ // for the local buffer ops on the result of GPU halo
+ // exchange, which operates in the non-local stream and
+            // writes to the local part of the force buffer.
+ // TODO improve this through use of an event - see Redmine #3093
+ nbv->stream_local_wait_for_nonlocal();
+ }
nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
nbv->getDeviceForces(),
pme_gpu_get_device_f(fr->pmedata),
// TODO Move this to happen during domain decomposition setup,
// once stream and event handling works well with that.
+ // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded)
{
- void *stream = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+        void *streamLocal = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local);
+ void *streamNonLocal =
+ Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
- cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, stream, coordinatesOnDeviceEvent);
+ cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, streamLocal,
+ streamNonLocal, coordinatesOnDeviceEvent);
}
/* Initialize the mdAtoms structure.
return static_cast<void *> (gpu_nbv->xrvec);
}
+void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t *gpu_nbv)
+{
+ return static_cast<void *> (gpu_nbv->frvec);
+}
+
void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb)
{
return static_cast<void*> (nb->xAvailableOnDevice);
nb->xNonLocalCopyD2HDone->waitForEvent();
}
+void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_cuda_t *nb)
+{
+ cudaStream_t localStream = nb->stream[InteractionLocality::Local];
+ cudaStream_t nonLocalStream = nb->stream[InteractionLocality::NonLocal];
+
+ GpuEventSynchronizer event;
+ event.markEvent(nonLocalStream);
+ event.enqueueWaitEvent(localStream);
+}
+
} // namespace Nbnxm
Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
}
+void* nonbonded_verlet_t::get_gpu_frvec()
+{
+ return Nbnxm::nbnxn_get_gpu_frvec(gpu_nbv);
+}
+
+void nonbonded_verlet_t::stream_local_wait_for_nonlocal()
+{
+ Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv);
+}
+
/*! \endcond */
/*! \brief Wait for non-local copy of coordinate buffer from device to host */
void wait_nonlocal_x_copy_D2H_done();
+ /*! \brief return GPU pointer to f in rvec format */
+ void* get_gpu_frvec();
+
+ /*! \brief Ensure local stream waits for non-local stream */
+ void stream_local_wait_for_nonlocal();
+
//! Return the kernel setup
const Nbnxm::KernelSetup &kernelSetup() const
{
CUDA_FUNC_QUALIFIER
void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;
+/*! \brief return GPU pointer to f in rvec format
+ * \param[in] nb The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Ensure local stream waits for non-local stream
+ * \param[in] nb The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;
+
} // namespace Nbnxm
#endif