copyToDeviceBuffer(
&d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, GpuApiCallBehavior::Async, nullptr);
}
- // This rank will push data to its neighbor, so needs to know
- // the remote receive address and similarly send its receive
- // address to other neighbour. We can do this here in reinit fn
- // since the pointers will not change until the next NS step.
- // Coordinates buffer:
- void* recvPtr = static_cast<void*>(&d_x_[atomOffset_]);
#if GMX_MPI
- MPI_Sendrecv(&recvPtr,
- sizeof(void*),
+ // Exchange of remote addresses with neighboring ranks is needed only with CUDA-direct, since cudaMemcpy needs both the source and destination pointers.
+ // MPI calls such as MPI_Send do not need the receive address; that is handled by the matching MPI_Recv call on the neighboring rank.
+ if (GMX_THREAD_MPI)
+ {
+ // This rank will push data to its neighbor, so it needs to know
+ // the remote receive address, and it must similarly send its own receive
+ // address to the other neighbor. We can do this here in the reinit function,
+ // since the pointers will not change until the next NS step.
+
+ // Coordinates buffer:
+ float3* recvPtr = &d_x_[atomOffset_];
+ MPI_Sendrecv(&recvPtr,
+ sizeof(void*),
+ MPI_BYTE,
+ recvRankX_,
+ 0,
+ &remoteXPtr_,
+ sizeof(void*),
+ MPI_BYTE,
+ sendRankX_,
+ 0,
+ mpi_comm_mysim_,
+ MPI_STATUS_IGNORE);
+
+ // Force buffer:
+ recvPtr = d_recvBuf_;
+ MPI_Sendrecv(&recvPtr,
+ sizeof(void*),
+ MPI_BYTE,
+ recvRankF_,
+ 0,
+ &remoteFPtr_,
+ sizeof(void*),
+ MPI_BYTE,
+ sendRankF_,
+ 0,
+ mpi_comm_mysim_,
+ MPI_STATUS_IGNORE);
+ }
+#endif
+
+ wallcycle_sub_stop(wcycle_, ewcsDD_GPU);
+ wallcycle_stop(wcycle_, ewcDOMDEC);
+
+ return;
+}
+
+void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+{
+ GMX_ASSERT(coordinatesReadyOnDeviceEvent != nullptr,
+ "Co-ordinate Halo exchange requires valid co-ordinate ready event");
+
+ // Wait for the event from the receiving task signalling that the remote coordinates are ready, and enqueue that event in the stream used
+ // for the subsequent data push. This avoids a race condition with the remote data being written in the previous timestep.
+ // Similarly, send an event to the task that will push data to this task.
+ GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
+ MPI_Sendrecv(&coordinatesReadyOnDeviceEvent,
+ sizeof(GpuEventSynchronizer*),
MPI_BYTE,
recvRankX_,
0,
- &remoteXPtr_,
- sizeof(void*),
+ &remoteCoordinatesReadyOnDeviceEvent,
+ sizeof(GpuEventSynchronizer*),
MPI_BYTE,
sendRankX_,
0,
mpi_comm_mysim_,
MPI_STATUS_IGNORE);
-
- // Force buffer:
- recvPtr = static_cast<void*>(d_recvBuf_);
- MPI_Sendrecv(&recvPtr,
- sizeof(void*),
- MPI_BYTE,
- recvRankF_,
- 0,
- &remoteFPtr_,
- sizeof(void*),
- MPI_BYTE,
- sendRankF_,
- 0,
- mpi_comm_mysim_,
- MPI_STATUS_IGNORE);
-#endif
-
- wallcycle_sub_stop(wcycle_, ewcsDD_GPU);
- wallcycle_stop(wcycle_, ewcDOMDEC);
-
- return;
+ remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
}
void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box,
// ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
wallcycle_start(wcycle_, ewcMOVEX);
- communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);
+ // Waiting for the remote coordinates is implicit with process-MPI, as the non-local stream is synchronized before the MPI calls,
+ // and the MPI_Wait call ensures that both neighboring ranks' non-local streams are synchronized before the data transfer is initiated
+ if (GMX_THREAD_MPI && pulse_ == 0)
+ {
+ enqueueWaitRemoteCoordinatesReadyEvent(coordinatesReadyOnDeviceEvent);
+ }
+
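+ // With thread-MPI the coordinate halo is pushed directly into the remote rank's coordinate buffer;
+ // with process-MPI it is received into this rank's own halo region.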
+ float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_];
+ communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_);
wallcycle_stop(wcycle_, ewcMOVEX);
// ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
wallcycle_start(wcycle_, ewcMOVEF);
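+ // With thread-MPI the force halo is pushed into the remote rank's receive buffer;
+ // with process-MPI it is received into this rank's own receive buffer.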
+ float3* recvPtr = GMX_THREAD_MPI ? remoteFPtr_ : d_recvBuf_;
+
// Communicate halo data (in non-local stream)
- communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr);
+ communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_);
wallcycle_stop(wcycle_, ewcMOVEF);
wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
-
-void GpuHaloExchange::Impl::communicateHaloData(float3* d_ptr,
- HaloQuantity haloQuantity,
- GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+void GpuHaloExchange::Impl::communicateHaloData(float3* sendPtr,
+ int sendSize,
+ int sendRank,
+ float3* recvPtr,
+ int recvSize,
+ int recvRank)
{
-
- void* sendPtr;
- int sendSize;
- void* remotePtr;
- int sendRank;
- int recvRank;
-
- if (haloQuantity == HaloQuantity::HaloCoordinates)
+ if (GMX_THREAD_MPI)
{
- sendPtr = static_cast<void*>(d_sendBuf_);
- sendSize = xSendSize_;
- remotePtr = remoteXPtr_;
- sendRank = sendRankX_;
- recvRank = recvRankX_;
-
-#if GMX_MPI
- // Wait for event from receiving task that remote coordinates are ready, and enqueue that event to stream used
- // for subsequent data push. This avoids a race condition with the remote data being written in the previous timestep.
- // Similarly send event to task that will push data to this task.
- GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
- MPI_Sendrecv(&coordinatesReadyOnDeviceEvent,
- sizeof(GpuEventSynchronizer*),
- MPI_BYTE,
- recvRank,
- 0,
- &remoteCoordinatesReadyOnDeviceEvent,
- sizeof(GpuEventSynchronizer*),
- MPI_BYTE,
- sendRank,
- 0,
- mpi_comm_mysim_,
- MPI_STATUS_IGNORE);
- remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
-#else
- GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
-#endif
+ // No need to synchronize explicitly with GMX_THREAD_MPI, as all operations are
+ // launched in the correct stream anyway
+ communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, recvPtr, recvRank);
}
else
{
- sendPtr = static_cast<void*>(&(d_ptr[atomOffset_]));
- sendSize = fSendSize_;
- remotePtr = remoteFPtr_;
- sendRank = sendRankF_;
- recvRank = recvRankF_;
+ communicateHaloDataWithCudaMPI(sendPtr, sendSize, sendRank, recvPtr, recvSize, recvRank);
}
+}
- communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank);
+void GpuHaloExchange::Impl::communicateHaloDataWithCudaMPI(float3* sendPtr,
+ int sendSize,
+ int sendRank,
+ float3* recvPtr,
+ int recvSize,
+ int recvRank)
+{
+ // no need to wait for haloDataReadyOnDevice event if this rank is not sending any data
+ if (sendSize > 0)
+ {
+ // Wait for the non-local stream to complete all outstanding
+ // activities, to ensure that the buffer is up to date in GPU memory
+ // before transferring it to the remote rank
+
+ // ToDo: Replace stream synchronize with event synchronize
+ nonLocalStream_.synchronize();
+ }
+
+ // perform halo exchange directly in device buffers
+#if GMX_MPI
+ MPI_Request request;
+
+ // recv remote data into halo region
+ MPI_Irecv(recvPtr, recvSize * DIM, MPI_FLOAT, recvRank, 0, mpi_comm_mysim_, &request);
+
+ // send data to remote halo region
+ MPI_Send(sendPtr, sendSize * DIM, MPI_FLOAT, sendRank, 0, mpi_comm_mysim_);
+
+ MPI_Wait(&request, MPI_STATUS_IGNORE);
+#endif
}
-void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
- int sendSize,
- int sendRank,
- void* remotePtr,
- int recvRank)
+void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr,
+ int sendSize,
+ int sendRank,
+ float3* remotePtr,
+ int recvRank)
{
cudaError_t stat;
// to its stream.
GpuEventSynchronizer* haloDataTransferRemote;
+ GMX_ASSERT(haloDataTransferLaunched_ != nullptr,
+ "Halo exchange requires valid event to synchronize data transfer initiated in "
+ "remote rank");
haloDataTransferLaunched_->markEvent(nonLocalStream_);
MPI_Sendrecv(&haloDataTransferLaunched_,
sendRankF_(dd->neighbor[dimIndex][0]),
recvRankF_(dd->neighbor[dimIndex][1]),
usePBC_(dd->ci[dd->dim[dimIndex]] == 0),
- haloDataTransferLaunched_(new GpuEventSynchronizer()),
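+ // the synchronization event is only needed for the thread-MPI (CUDA-direct) path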
+ haloDataTransferLaunched_(GMX_THREAD_MPI ? new GpuEventSynchronizer() : nullptr),
mpi_comm_mysim_(mpi_comm_mysim),
deviceContext_(deviceContext),
localStream_(localStream),
pulse_(pulse),
wcycle_(wcycle)
{
-
- GMX_RELEASE_ASSERT(GMX_THREAD_MPI,
- "GPU Halo exchange is currently only supported with thread-MPI enabled");
-
if (usePBC_ && dd->unitCellInfo.haveScrewPBC)
{
gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n");
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
private:
/*! \brief Data transfer wrapper for GPU halo exchange
- * \param [inout] d_ptr pointer to coordinates or force buffer in GPU memory
- * \param [in] haloQuantity switch on whether X or F halo exchange is being performed
- * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
+ * \param [in] sendPtr send buffer address
+ * \param [in] sendSize number of elements to send
+ * \param [in] sendRank rank of destination
+ * \param [in] recvPtr receive buffer address
+ * \param [in] recvSize number of elements to receive
+ * \param [in] recvRank rank of source
*/
- void communicateHaloData(float3* d_ptr,
- HaloQuantity haloQuantity,
- GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+ void communicateHaloData(float3* sendPtr, int sendSize, int sendRank, float3* recvPtr, int recvSize, int recvRank);
/*! \brief Data transfer for GPU halo exchange using CUDA memcopies
* \param [inout] sendPtr address to send data from
* \param [in] sendSize number of atoms to be sent
* \param [in] sendRank rank to send data to
- * \param [inout] remotePtr remote address to recv data
+ * \param [in] remotePtr remote address to recv data
* \param [in] recvRank rank to recv data from
*/
- void communicateHaloDataWithCudaDirect(void* sendPtr, int sendSize, int sendRank, void* remotePtr, int recvRank);
+ void communicateHaloDataWithCudaDirect(float3* sendPtr, int sendSize, int sendRank, float3* remotePtr, int recvRank);
+
+ /*! \brief Data transfer for GPU halo exchange using MPI_Send and MPI_Recv
+ * \param [in] sendPtr send buffer address
+ * \param [in] sendSize number of elements to send
+ * \param [in] sendRank rank of destination
+ * \param [in] recvPtr receive buffer address
+ * \param [in] recvSize number of elements to receive
+ * \param [in] recvRank rank of source
+ */
+ void communicateHaloDataWithCudaMPI(float3* sendPtr,
+ int sendSize,
+ int sendRank,
+ float3* recvPtr,
+ int recvSize,
+ int recvRank);
+
+ /*! \brief Exchange coordinate-ready event with neighbor ranks and enqueue wait in non-local stream
+ * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates are ready on the device
+ */
+ void enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
//! Domain decomposition object
gmx_domdec_t* dd_ = nullptr;
//! number of home atoms - offset of local halo region
int numHomeAtoms_ = 0;
//! remote GPU coordinates buffer pointer for pushing data
- void* remoteXPtr_ = nullptr;
+ float3* remoteXPtr_ = nullptr;
//! remote GPU force buffer pointer for pushing data
- void* remoteFPtr_ = nullptr;
+ float3* remoteFPtr_ = nullptr;
//! Periodic Boundary Conditions for this rank
bool usePBC_ = false;
//! force shift buffer on device
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"
+#include "gromacs/utility/mpiinfo.h"
#include "isimulator.h"
#include "membedholder.h"
devFlags.enableGpuBufferOps =
GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
- devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_DD_COMMS") != nullptr;
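+ // GPU halo exchange is no longer restricted to thread-MPI; with a CUDA-aware MPI library the
+ // direct communication path can also be used (support is verified below)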
+ devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr;
devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
devFlags.enableGpuPmePPComm =
GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
#pragma GCC diagnostic pop
+ // The direct GPU communication path is used with CUDA-aware MPI;
+ // make sure the underlying MPI implementation is CUDA-aware
+ if (!GMX_THREAD_MPI && devFlags.enableGpuHaloExchange)
+ {
+ const bool haveDetectedCudaAwareMpi =
+ (checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported);
+ const bool forceCudaAwareMpi = (getenv("GMX_FORCE_CUDA_AWARE_MPI") != nullptr);
+
+ if (!haveDetectedCudaAwareMpi && forceCudaAwareMpi)
+ {
+ // CUDA-aware support was not detected in the MPI library, but the user has forced its use
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "This run has forced use of 'CUDA-aware MPI'. "
+ "But, GROMACS cannot determine if underlying MPI "
+ "is CUDA-aware. GROMACS recommends use of latest openMPI version "
+ "for CUDA-aware support. "
+ "If you observe failures at runtime, try unsetting "
+ "GMX_FORCE_CUDA_AWARE_MPI environment variable.");
+ }
+
+ if (haveDetectedCudaAwareMpi || forceCudaAwareMpi)
+ {
+ devFlags.usingCudaAwareMpi = true;
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted("Using CUDA-aware MPI for 'GPU halo exchange' feature.");
+ }
+ else
+ {
+ if (devFlags.enableGpuHaloExchange)
+ {
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "GMX_GPU_DD_COMMS environment variable detected, but the 'GPU "
+ "halo exchange' feature will not be enabled as GROMACS couldn't "
+ "detect CUDA_aware support in underlying MPI implementation.");
+ devFlags.enableGpuHaloExchange = false;
+ }
+
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "GROMACS recommends use of latest OpenMPI version for CUDA-aware "
+ "support. "
+ "If you are certain about CUDA-aware support in your MPI library, "
+ "you can force it's use by setting environment variable "
+ " GMX_FORCE_CUDA_AWARE_MPI.");
+ }
+ }
+
if (devFlags.enableGpuBufferOps)
{
GMX_LOG(mdlog.warning)
{
physicalNodeComm.barrier();
}
- releaseDevice(deviceInfo);
+
+ if (!devFlags.usingCudaAwareMpi)
+ {
+ // Don't reset the GPU when using CUDA-aware MPI:
+ // UCX creates CUDA buffers which are cleaned up as part of MPI_Finalize(),
+ // and resetting the device before MPI_Finalize() results in crashes inside UCX
+ releaseDevice(deviceInfo);
+ }
/* Does what it says */
print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());