From: Alan Gray Date: Tue, 26 Feb 2019 12:43:22 +0000 (-0800) Subject: GPU halo exchange X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=44f607d71afaea9e51bb9d1cc2bc132f34482f32;p=alexxy%2Fgromacs.git GPU halo exchange Activate with GMX_GPU_DD_COMMS and GMX_USE_GPU_BUFFER_OPS environment variable. Class to initialize and apply coordinate buffer halo exchange functionality directly on GPU memory space. Currently only supports direct cuda memcpy, and relies on thread MPI being in use. Updated gpucomm testing matrices to cover non-GPU case. Limitation: still only supports thread MPI, 1D data decomposition and only coordinate halo exchange Implements part of #2890 Associated with #2915 Change-Id: I8e6473481ad4d943df78d7019681bfa821bd5798 --- diff --git a/admin/builds/gpucomm-matrix.txt b/admin/builds/gpucomm-matrix.txt index a189c38786..ef7895fe30 100644 --- a/admin/builds/gpucomm-matrix.txt +++ b/admin/builds/gpucomm-matrix.txt @@ -13,14 +13,16 @@ # Test an older version of CUDA # Test MPI with CUDA -# Test MPMD PME with thread-MPI (library MPI not supported initially) +# Test multiple PP ranks with separate PME rank +# Test with thread-MPI (library MPI not supported initially) # Test GPU comm features in the above combination -gcc-5 gpuhw=nvidia cuda-9.0 npme=1 nranks=2 thread-mpi openmp gpucomm +gcc-5 gpuhw=nvidia cuda-9.0 npme=1 nranks=3 thread-mpi openmp gpucomm # Test newest gcc supported by newest CUDA at time of release +# Test multiple PP ranks without separate PME rank # Test thread-MPI with CUDA # Test GPU comm features in the above combination -gcc-8 gpuhw=nvidia nranks=1 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpucomm +gcc-8 gpuhw=nvidia nranks=2 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpucomm # Test non-default use of mdrun -gpu_id # Test GPU-sharing among 4 PP ranks @@ -36,3 +38,6 @@ gcc-7 gpuhw=none cuda-10.0 openmp no-tng release-with-assert gpucomm # Test OpenCL build with gpudev features # Test GPU comm on the OpenCL path where it is unsupported clang-8 openmp gpuhw=amd opencl-1.2 clFFT-2.14 simd=None gpucomm + +# Test a non-GPU build with the gpucomm optiom set +gcc-8 double x11 no-tng fftpack simd=sse4.1 gpucomm diff --git a/src/gromacs/domdec/CMakeLists.txt b/src/gromacs/domdec/CMakeLists.txt index 1a17c41300..c4fca1e308 100644 --- a/src/gromacs/domdec/CMakeLists.txt +++ b/src/gromacs/domdec/CMakeLists.txt @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2014,2018, by the GROMACS development team, led by +# Copyright (c) 2014,2018,2019, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -33,7 +33,12 @@ # the research papers on the package. Check out http://www.gromacs.org. file(GLOB DOMDEC_SOURCES *.cpp) -set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${DOMDEC_SOURCES} PARENT_SCOPE) + +if(GMX_USE_CUDA) + file(GLOB DOMDEC_CUDA_SOURCES gpuhaloexchange_impl.cu) +endif() + +set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${DOMDEC_SOURCES} ${DOMDEC_CUDA_SOURCES} PARENT_SCOPE) if (BUILD_TESTING) add_subdirectory(tests) diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index 5c76700e8a..f02b37a39c 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -55,6 +55,7 @@ #include "gromacs/domdec/dlbtiming.h" #include "gromacs/domdec/domdec_network.h" #include "gromacs/domdec/ga2la.h" +#include "gromacs/domdec/gpuhaloexchange.h" #include "gromacs/domdec/options.h" #include "gromacs/domdec/partition.h" #include "gromacs/gmxlib/network.h" @@ -3016,7 +3017,13 @@ getDDSettings(const gmx::MDLogger &mdlog, ddSettings.useSendRecv2 = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0); ddSettings.dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10); - ddSettings.request1DAnd1Pulse = bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0)); + // TODO GPU halo exchange requires a 1D single-pulse DD, and when + // it is properly integrated the hack with GMX_GPU_DD_COMMS should + // be removed. + ddSettings.request1DAnd1Pulse = (bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0)) || + (bool(getenv("GMX_GPU_DD_COMMS") != nullptr && + GMX_THREAD_MPI && + (GMX_GPU == GMX_GPU_CUDA)))); ddSettings.useDDOrderZYX = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0)); ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1)); ddSettings.eFlop = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0); diff --git a/src/gromacs/domdec/domdec_struct.h b/src/gromacs/domdec/domdec_struct.h index 88bc93be50..60bd04913d 100644 --- a/src/gromacs/domdec/domdec_struct.h +++ b/src/gromacs/domdec/domdec_struct.h @@ -74,6 +74,7 @@ namespace gmx { template class HashedMap; class LocalAtomSetManager; +class GpuHaloExchange; } typedef struct { @@ -226,6 +227,9 @@ struct gmx_domdec_t { //NOLINT(clang-analyzer-optin.performance.Padding) /* gmx_pme_recv_f buffer */ std::vector pmeForceReceiveBuffer; + + /* GPU halo exchange object */ + std::unique_ptr gpuHaloExchange; }; //! Are we the master node for domain decomposition diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h new file mode 100644 index 0000000000..58f908b76a --- /dev/null +++ b/src/gromacs/domdec/gpuhaloexchange.h @@ -0,0 +1,108 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * \brief Declaration of GPU halo exchange. + * + * \author Alan Gray + * \inlibraryapi + * \ingroup module_domdec + */ +#ifndef GMX_DOMDEC_GPUHALOEXCHANGE_H +#define GMX_DOMDEC_GPUHALOEXCHANGE_H + +#include "gromacs/math/vectypes.h" +#include "gromacs/utility/basedefinitions.h" +#include "gromacs/utility/classhelpers.h" +#include "gromacs/utility/gmxmpi.h" + +struct gmx_domdec_t; + +namespace gmx +{ + +/*! \libinternal + * \brief Manages GPU Halo Exchange object */ +class GpuHaloExchange +{ + + public: + /*! \brief Creates GPU Halo Exchange object. + * + * Halo exchange will be performed in \c streamNonLocal, and + * the main communicateHaloCoordinates method must be called + * before any subsequent operations that access non-local + * parts of the coordinate buffer (such as the non-local + * non-bonded kernels). It also must be called after the local + * coordinates buffer operations (where the coordinates are + * copied to the device and hence the \c + * coordinatesOnDeviceEvent is recorded). + * + * \param [inout] dd domdec structure + * \param [in] mpi_comm_mysim communicator used for simulation + * \param [in] streamNonLocal non-local NB CUDA stream. + * \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device + */ + GpuHaloExchange(gmx_domdec_t *dd, + MPI_Comm mpi_comm_mysim, + void *streamNonLocal, + void *coordinatesOnDeviceEvent); + ~GpuHaloExchange(); + + /*! \brief + * + * Initialization for GPU halo exchange of coordinates buffer + * \param [in] d_coordinateBuffer pointer to coordinates buffer in GPU memory + */ + void reinitHalo(rvec *d_coordinateBuffer); + + + /*! \brief GPU halo exchange of coordinates buffer. + * + * Must be called after local setCoordinates (which records an + * event when the coordinate data has been copied to the + * device). + * \param [in] box Coordinate box (from which shifts will be constructed) + */ + void communicateHaloCoordinates(const matrix box); + + private: + class Impl; + gmx::PrivateImplPointer impl_; + +}; + +} //namespace gmx + +#endif diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp new file mode 100755 index 0000000000..4b491eac36 --- /dev/null +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp @@ -0,0 +1,89 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief May be used to implement Domdec CUDA interfaces for non-GPU builds. + * + * Currently, reports and exits if any of the interfaces are called. + * Needed to satisfy compiler on systems, where CUDA is not available. + * + * \author Alan Gray + * + * \ingroup module_domdec + */ +#include "gmxpre.h" + +#include "config.h" + +#include "gromacs/domdec/gpuhaloexchange.h" + +#if GMX_GPU != GMX_GPU_CUDA + +namespace gmx +{ + +/*!\brief Impl class stub. */ +class GpuHaloExchange::Impl +{ +}; + +/*!\brief Constructor stub. */ +GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */, + MPI_Comm /* mpi_comm_mysim */, + void * /*streamNonLocal */, + void * /*coordinatesOnDeviceEvent*/) + : impl_(nullptr) +{ + GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation."); +} + +GpuHaloExchange::~GpuHaloExchange() = default; + +/*!\brief init halo exhange stub. */ +void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */) +{ + GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation."); +} + +/*!\brief apply X halo exchange stub. */ +void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */) +{ + GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange exchange was called insted of the correct implementation."); +} + + +} // namespace gmx + +#endif /* GMX_GPU != GMX_GPU_CUDA */ diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu new file mode 100644 index 0000000000..4e8ba35dd4 --- /dev/null +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -0,0 +1,344 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements GPU halo exchange using CUDA. + * + * + * \author Alan Gray + * + * \ingroup module_domdec + */ +#include "gmxpre.h" + +#include "gpuhaloexchange_impl.cuh" + +#include "config.h" + +#include +#include + +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/domdec/gpuhaloexchange.h" +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" +#include "gromacs/gpu_utils/vectype_ops.cuh" +#include "gromacs/pbcutil/ishift.h" + +#include "domdec_internal.h" + +namespace gmx +{ + +//! Number of CUDA threads in a block +//TODO Optimize this through experimentation +constexpr static int c_threadsPerBlock = 256; + +template +__global__ void packSendBufKernel(float3 * __restrict__ dataPacked, + const float3 * __restrict__ data, + const int * __restrict__ map, + const int mapSize, + const float3 coordinateShift) +{ + int threadIndex = blockIdx.x*blockDim.x+threadIdx.x; + float3 *gm_dataDest = &dataPacked[threadIndex]; + const float3 *gm_dataSrc = &data[map[threadIndex]]; + + if (threadIndex < mapSize) + { + if (usePBC) + { + *gm_dataDest = *gm_dataSrc + coordinateShift; + } + else + { + *gm_dataDest = *gm_dataSrc; + } + + } + + return; +} + +void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer) +{ + + d_x_ = d_coordinatesBuffer; + + cudaStream_t stream = nonLocalStream_; + int nzone = 1; + const gmx_domdec_comm_t &comm = *dd_->comm; + const gmx_domdec_comm_dim_t &cd = comm.cd[0]; + const gmx_domdec_ind_t &ind = cd.ind[0]; + int newSize = ind.nsend[nzone+1]; + + GMX_RELEASE_ASSERT(cd.numPulses() == 1, "Multiple pulses are not yet supported in GPU halo exchange"); + GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange"); + + // reallocates only if needed + h_indexMap_.resize(newSize); + // reallocate on device only if needed + if (newSize > maxPackedBufferSize_) + { + reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, nullptr); + reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, nullptr); + reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, nullptr); + maxPackedBufferSize_ = newSize; + } + + xSendSize_ = newSize; + MPI_Sendrecv(&xSendSize_, sizeof(int), MPI_BYTE, sendRankX_, 0, + &xRecvSize_, sizeof(int), MPI_BYTE, recvRankX_, 0, + mpi_comm_mysim_, MPI_STATUS_IGNORE); + fSendSize_ = xRecvSize_; + fRecvSize_ = xSendSize_; + + localOffset_ = comm.atomRanges.numHomeAtoms(); //offset for data recieved by this rank + + GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch"); + std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin()); + + copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, stream, GpuApiCallBehavior::Async, nullptr); + + // This rank will push data to its neighbor, so needs to know + // the remote receive address and similarly send its receive + // address to other neighbour. We can do this here in reinit fn + // since the pointers will not change until the next NS step. + + //Coordinates buffer: + void* recvPtr = static_cast (&d_coordinatesBuffer[localOffset_]); + MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0, + &remoteXPtr_, sizeof(void*), MPI_BYTE, sendRankX_, 0, + mpi_comm_mysim_, MPI_STATUS_IGNORE); + + //Force buffer: + recvPtr = static_cast (d_recvBuf_); + MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankF_, 0, + &remoteFPtr_, sizeof(void*), MPI_BYTE, sendRankF_, 0, + mpi_comm_mysim_, MPI_STATUS_IGNORE); + + + return; +} + +// The following method be called after local setCoordinates (which records the coordinatesOnDeviceEvent_ +// event when the coordinate data has been copied to the device). +void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box) +{ + + //ensure stream waits until coordinate data is available on device + coordinatesOnDeviceEvent_->enqueueWaitEvent(nonLocalStream_); + + // launch kernel to pack send buffer + KernelLaunchConfig config; + config.blockSize[0] = c_threadsPerBlock; + config.blockSize[1] = 1; + config.blockSize[2] = 1; + config.gridSize[0] = (xSendSize_+c_threadsPerBlock-1)/c_threadsPerBlock; + config.gridSize[1] = 1; + config.gridSize[2] = 1; + config.sharedMemorySize = 0; + config.stream = nonLocalStream_; + + const float3 *sendBuf = d_sendBuf_; + const float3 *d_x = d_x_; + const int *indexMap = d_indexMap_; + const int size = xSendSize_; + // The coordinateShift changes between steps when we have + // performed a DD partition, or have updated the box e.g. when + // performing pressure coupling. So, for simplicity, the the box + // is used every step to pass the shift vector as an argument of + // the packing kernel. + // + // Because only one-dimensional DD is supported, the coordinate + // shift only needs to handle that dimension. + const int dimensionIndex = dd_->dim[0]; + const float3 coordinateShift { + box[dimensionIndex][XX], box[dimensionIndex][YY], box[dimensionIndex][ZZ] + }; + + // Avoid launching kernel when there is no work to do + if (size > 0) + { + auto kernelFn = usePBC_ ? packSendBufKernel : packSendBufKernel; + + const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &sendBuf, &d_x, &indexMap, + &size, &coordinateShift); + + launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs); + } + + communicateHaloData(d_x_, HaloQuantity::HaloCoordinates); + + return; +} + + +void GpuHaloExchange::Impl::communicateHaloData(float3 * d_ptr, + HaloQuantity haloQuantity) +{ + + void * sendPtr; + int sendSize; + void * remotePtr; + int sendRank; + int recvRank; + if (haloQuantity == HaloQuantity::HaloCoordinates) + { + sendPtr = static_cast (d_sendBuf_); + sendSize = xSendSize_; + remotePtr = remoteXPtr_; + sendRank = sendRankX_; + recvRank = recvRankX_; + } + else + { + sendPtr = static_cast (&(d_ptr[localOffset_])); + sendSize = fSendSize_; + remotePtr = remoteFPtr_; + sendRank = sendRankF_; + recvRank = recvRankF_; + } + + communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank); +} + + +void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr, + int sendSize, + int sendRank, + void *remotePtr, + int recvRank) +{ + + cudaError_t stat; + + // We asynchronously push data to remote rank. The remote + // destination pointer has already been set in the init fn. We + // don't need to worry about overwriting data the remote ranks + // still needs since the halo exchange is just done once per + // timestep, for each of X and F. + + // send data to neighbor, if any data exists to send + if (sendSize > 0) + { + stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, nonLocalStream_); + CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed"); + } + + //ensure pushed data has arrived before remote rank progresses + // This rank records an event and sends it to the remote rank which has just been pushed data. + // This rank recieves event from remote rank which has pushed data here, and enqueues that event to + // its stream. + GpuEventSynchronizer *haloDataTransferRemote; + + haloDataTransferLaunched_->markEvent(nonLocalStream_); + + MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0, + &haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0, + mpi_comm_mysim_, MPI_STATUS_IGNORE); + + haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_); + +} + +/*! \brief Create Domdec GPU object */ +GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd, + MPI_Comm mpi_comm_mysim, + void * nonLocalStream, + void * coordinatesOnDeviceEvent) + : dd_(dd), + sendRankX_(dd->neighbor[0][1]), + recvRankX_(dd->neighbor[0][0]), + sendRankF_(dd->neighbor[0][0]), + recvRankF_(dd->neighbor[0][1]), + usePBC_(dd->ci[dd->dim[0]] == 0), + haloDataTransferLaunched_(new GpuEventSynchronizer()), + mpi_comm_mysim_(mpi_comm_mysim), + nonLocalStream_(*static_cast (nonLocalStream)), + coordinatesOnDeviceEvent_(static_cast (coordinatesOnDeviceEvent)) +{ + + GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "GPU Halo exchange is currently only supported with thread-MPI enabled"); + + if (dd->ndim > 1) + { + gmx_fatal(FARGS, "Error: dd->ndim > 1 is not yet supported in GPU halo exchange"); + } + + if (usePBC_ && dd->unitCellInfo.haveScrewPBC) + { + gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n"); + } + + changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported); + + allocateDeviceBuffer(&d_fShift_, 1, nullptr); + +} + +GpuHaloExchange::Impl::~Impl() +{ + freeDeviceBuffer(&d_indexMap_); + freeDeviceBuffer(&d_sendBuf_); + freeDeviceBuffer(&d_recvBuf_); + freeDeviceBuffer(&d_fShift_); + delete haloDataTransferLaunched_; +} + +GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd, + MPI_Comm mpi_comm_mysim, + void *nonLocalStream, + void *coordinatesOnDeviceEvent) + : impl_(new Impl(dd, mpi_comm_mysim, nonLocalStream, coordinatesOnDeviceEvent)) +{ +} + +GpuHaloExchange::~GpuHaloExchange() = default; + +void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer) +{ + impl_->reinitHalo(reinterpret_cast(d_coordinatesBuffer)); +} + +void GpuHaloExchange::communicateHaloCoordinates(const matrix box) +{ + impl_->communicateHaloCoordinates(box); +} + +} //namespace gmx diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh new file mode 100644 index 0000000000..bd125654ee --- /dev/null +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh @@ -0,0 +1,180 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declares CUDA implementation of GPU Halo Exchange. + * + * This header file is needed to include from both the device-side + * kernels file, and the host-side management code. + * + * \author Alan Gray + * + * \ingroup module_domdec + */ +#ifndef GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H +#define GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H + +#include "gromacs/domdec/gpuhaloexchange.h" +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" +#include "gromacs/gpu_utils/hostallocator.h" + +namespace gmx +{ + +/*! \brief switch for whether coordinates or force halo is being applied */ +enum class HaloQuantity +{ + HaloCoordinates, HaloForces +}; + +/*! \internal \brief Class with interfaces and data for GPU Halo Exchange */ +class GpuHaloExchange::Impl +{ + + public: + /*! \brief Creates GPU Halo Exchange object. + * + * \param [inout] dd domdec structure + * \param [in] mpi_comm_mysim communicator used for simulation + * \param [in] nonLocalStream non-local NB CUDA stream + * \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device + */ + Impl(gmx_domdec_t *dd, + MPI_Comm mpi_comm_mysim, + void *nonLocalStream, + void *coordinatesOnDeviceEvent); + ~Impl(); + + /*! \brief + * (Re-) Initialization for GPU halo exchange + * \param [in] d_coordinatesBuffer pointer to coordinates buffer in GPU memory + */ + void reinitHalo(float3 *d_coordinatesBuffer); + + + /*! \brief + * GPU halo exchange of coordinates buffer + * \param [in] box Coordinate box (from which shifts will be constructed) + */ + void communicateHaloCoordinates(const matrix box); + + private: + + /*! \brief Data transfer wrapper for GPU halo exchange + * \param [inout] d_ptr pointer to coordinates or force buffer in GPU memory + * \param [in] haloQuantity switch on whether X or F halo exchange is being performed + */ + void communicateHaloData(float3 *d_ptr, + HaloQuantity haloQuantity); + + /*! \brief Data transfer for GPU halo exchange using CUDA memcopies + * \param [inout] sendPtr address to send data from + * \param [in] sendSize number of atoms to be sent + * \param [in] sendRank rank to send data to + * \param [inout] remotePtr remote address to recv data + * \param [in] recvRank rank to recv data from + */ + void communicateHaloDataWithCudaDirect(void *sendPtr, + int sendSize, + int sendRank, + void* remotePtr, + int recvRank); + + //! Domain decomposition object + gmx_domdec_t *dd_ = nullptr; + //! map of indices to be sent from this rank + gmx::HostVector h_indexMap_; + //! device copy of index map + int *d_indexMap_ = nullptr; + //! number of elements in index map array + int indexMapSize_ = -1; + //! number of elements allocated in index map array + int indexMapSizeAlloc_ = -1; + //! device buffer for sending packed data + float3 *d_sendBuf_ = nullptr; + //! number of atoms in sendbuf array + int sendBufSize_ = -1; + //! number of atoms allocated in sendbuf array + int sendBufSizeAlloc_ = -1; + //! device buffer for receiving packed data + float3 *d_recvBuf_ = nullptr; + //! maximum size of packed buffer + int maxPackedBufferSize_ = 0; + //! number of atoms in recvbuf array + int recvBufSize_ = -1; + //! number of atoms allocated in recvbuf array + int recvBufSizeAlloc_ = -1; + //! rank to send data to for X + int sendRankX_ = 0; + //! rank to recv data from for X + int recvRankX_ = 0; + //! rank to send data to for F + int sendRankF_ = 0; + //! rank to recv data from for F + int recvRankF_ = 0; + //! send copy size from this rank for X + int xSendSize_ = 0; + //! recv copy size to this rank for X + int xRecvSize_ = 0; + //! send copy size from this rank for F + int fSendSize_ = 0; + //! recv copy size to this rank for F + int fRecvSize_ = 0; + //! offset of local halo region + int localOffset_ = 0; + //! remote GPU coordinates buffer pointer for pushing data + void *remoteXPtr_ = 0; + //! remote GPU force buffer pointer for pushing data + void *remoteFPtr_ = 0; + //! Periodic Boundary Conditions for this rank + bool usePBC_ = false; + //! force shift buffer on device + float3 * d_fShift_ = nullptr; + //! Event triggered when halo transfer has been launched with direct CUD memory copy + GpuEventSynchronizer *haloDataTransferLaunched_ = nullptr; + //! MPI communicator used for simulation + MPI_Comm mpi_comm_mysim_; + //! CUDA stream for non-local non-bonded calculations + cudaStream_t nonLocalStream_ = nullptr; + //! Event triggered when coordinates have been copied to device + GpuEventSynchronizer *coordinatesOnDeviceEvent_ = nullptr; + //! full coordinates buffer in GPU memory + float3 *d_x_ = nullptr; + +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index c540fa86f6..e2fffacd83 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -49,6 +49,7 @@ #include "gromacs/domdec/dlbtiming.h" #include "gromacs/domdec/domdec.h" #include "gromacs/domdec/domdec_struct.h" +#include "gromacs/domdec/gpuhaloexchange.h" #include "gromacs/domdec/partition.h" #include "gromacs/essentialdynamics/edsam.h" #include "gromacs/ewald/pme.h" @@ -125,6 +126,10 @@ static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GP // TODO eventially tie this in with other existing GPU flags. static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr); +/*! \brief environment variable to enable GPU P2P communication */ +static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr) + && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA); + static void sum_forces(rvec f[], gmx::ArrayRef forceToAdd) { const int end = forceToAdd.size(); @@ -1180,6 +1185,12 @@ void do_force(FILE *fplog, launchPmeGpuFftAndGather(fr->pmedata, wcycle); } + const bool ddUsesGpuDirectCommunication + = c_enableGpuHaloExchange && c_enableGpuBufOps && bUseGPU && havePPDomainDecomposition(cr); + gmx::GpuHaloExchange *gpuHaloExchange = ddUsesGpuDirectCommunication ? cr->dd->gpuHaloExchange.get() : nullptr; + GMX_ASSERT(!ddUsesGpuDirectCommunication || gpuHaloExchange != nullptr, + "Must have valid gpuHaloExchange when doing halo exchange on the GPU"); + /* Communicate coordinates and sum dipole if necessary + do non-local pair search */ if (havePPDomainDecomposition(cr)) @@ -1196,15 +1207,36 @@ void do_force(FILE *fplog, nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::NonLocal); wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL); wallcycle_stop(wcycle, ewcNS); + if (ddUsesGpuDirectCommunication) + { + rvec* d_x = static_cast (nbv->get_gpu_xrvec()); + gpuHaloExchange->reinitHalo(d_x); + } } else { - dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle); + if (ddUsesGpuDirectCommunication) + { + // The following must be called after local setCoordinates (which records an event + // when the coordinate data has been copied to the device). + gpuHaloExchange->communicateHaloCoordinates(box); + + // TODO Force flags should include haveFreeEnergyWork for this domain + if (forceWork.haveCpuBondedWork || (fr->efep != efepNO)) + { + //non-local part of coordinate buffer must be copied back to host for CPU work + nbv->launch_copy_x_from_gpu(as_rvec_array(x.unpaddedArrayRef().data()), Nbnxm::AtomLocality::NonLocal); + } + } + else + { + dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle); + } if (useGpuXBufOps == BufferOpsUseGpu::True) { // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr) - if (!useGpuPme) + if (!useGpuPme && !ddUsesGpuDirectCommunication) { nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::NonLocal, false, x.unpaddedArrayRef()); @@ -1403,6 +1435,13 @@ void do_force(FILE *fplog, update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box); } + // TODO Force flags should include haveFreeEnergyWork for this domain + if (ddUsesGpuDirectCommunication && + (forceWork.haveCpuBondedWork || (fr->efep != efepNO))) + { + /* Wait for non-local coordinate data to be copied from device */ + nbv->wait_nonlocal_x_copy_D2H_done(); + } /* Compute the bonded and non-bonded energies and optionally forces */ do_force_lowlevel(fr, inputrec, &(top->idef), cr, ms, nrnb, wcycle, mdatoms, @@ -1419,8 +1458,10 @@ void do_force(FILE *fplog, forceFlags, &forceOut.forceWithVirial(), enerd, ed, forceFlags.doNeighborSearch); - bool useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction; - bool haveCpuForces = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuFPmeReduction); + const bool useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction; + // TODO Force flags should include haveFreeEnergyWork for this domain + const bool haveCpuForces = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || + useCpuFPmeReduction || (fr->efep != efepNO)); // Will store the amount of cycles spent waiting for the GPU that // will be later used in the DLB accounting. diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 5caa3caee5..68348803b2 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -59,6 +59,7 @@ #include "gromacs/commandline/filenm.h" #include "gromacs/domdec/domdec.h" #include "gromacs/domdec/domdec_struct.h" +#include "gromacs/domdec/gpuhaloexchange.h" #include "gromacs/domdec/localatomsetmanager.h" #include "gromacs/domdec/partition.h" #include "gromacs/ewald/ewald_utils.h" @@ -161,18 +162,25 @@ namespace gmx { -/*! \brief Log if development feature flags are encountered +/*! \brief environment variable to enable GPU P2P communication */ +static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr) + && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA); + +/*! \brief Manage any development feature flag variables encountered * - * The use of dev features indicated by environment variables is logged - * in order to ensure that runs with such featrues enabled can be identified - * from their log and standard output. + * The use of dev features indicated by environment variables is + * logged in order to ensure that runs with such featrues enabled can + * be identified from their log and standard output. Any cross + * dependencies are also checked, and if unsatisified, a fatal error + * issued. * * \param[in] mdlog Logger object. */ -static void reportDevelopmentFeatures(const gmx::MDLogger &mdlog) +static void manageDevelopmentFeatures(const gmx::MDLogger &mdlog) { const bool enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr); const bool useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr); + const bool enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA)); if (enableGpuBufOps) { @@ -180,11 +188,22 @@ static void reportDevelopmentFeatures(const gmx::MDLogger &mdlog) "NOTE: This run uses the 'GPU buffer ops' feature, enabled by the GMX_USE_GPU_BUFFER_OPS environment variable."); } + if (enableGpuHaloExchange) + { + if (!enableGpuBufOps) + { + gmx_fatal(FARGS, "Cannot enable GPU halo exchange without GPU buffer operations, set GMX_USE_GPU_BUFFER_OPS=1\n"); + } + GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( + "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable."); + } + if (useGpuUpdateConstrain) { GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( "NOTE: This run uses the 'GPU update/constraints' feature, enabled by the GMX_UPDATE_CONSTRAIN_GPU environment variable."); } + } /*! \brief Barrier for safe simultaneous thread access to mdrunner data @@ -654,7 +673,7 @@ int Mdrunner::mdrunner() gmx::MDLogger mdlog(logOwner.logger()); // report any development features that may be enabled by environment variables - reportDevelopmentFeatures(mdlog); + manageDevelopmentFeatures(mdlog); // With thread-MPI, the communicator changes after threads are // launched, so this is rebuilt for the master rank at that @@ -1316,6 +1335,15 @@ int Mdrunner::mdrunner() pforce, wcycle); + // TODO Move this to happen during domain decomposition setup, + // once stream and event handling works well with that. + if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded) + { + void *stream = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal); + void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event(); + cr->dd->gpuHaloExchange = std::make_unique(cr->dd, cr->mpi_comm_mysim, stream, coordinatesOnDeviceEvent); + } + /* Initialize the mdAtoms structure. * mdAtoms is not filled with atom data, * as this can not be done now with domain decomposition. diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 5c72c7faad..9715d2985f 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -111,7 +111,6 @@ #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu" #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */ - namespace Nbnxm { @@ -777,6 +776,11 @@ void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid &grid, copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, numCopyAtoms, stream, GpuApiCallBehavior::Async, nullptr); + if (interactionLoc == Nbnxm::InteractionLocality::Local) + { + nb->xAvailableOnDevice->markEvent(stream); + } + if (bDoTime) { nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream); @@ -1011,6 +1015,47 @@ void nbnxn_launch_copy_f_from_gpu(const AtomLocality atomLocality, return; } +void nbnxn_launch_copy_x_from_gpu(const AtomLocality atomLocality, + const Nbnxm::GridSet &gridSet, + gmx_nbnxn_gpu_t *nb, + rvec *x) +{ + GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); + GMX_ASSERT(x, "Need a valid x pointer"); + + const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); + cudaStream_t stream = nb->stream[iLocality]; + + bool bDoTime = nb->bDoTime; + cu_timers_t *t = nb->timers; + int atomStart, nAtoms; + + nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms); + + if (bDoTime) + { + t->xf[atomLocality].nb_d2h.openTimingRegion(stream); + } + + GMX_ASSERT(nb->xrvec, "Need a valid nb->xrvec pointer"); + rvec *ptrDest = reinterpret_cast (x[atomStart]); + rvec *ptrSrc = reinterpret_cast (nb->xrvec[atomStart]); + copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms, + stream, GpuApiCallBehavior::Async, stream); + + if (atomLocality == AtomLocality::NonLocal) + { + nb->xNonLocalCopyD2HDone->markEvent(stream); + } + + if (bDoTime) + { + t->xf[atomLocality].nb_d2h.closeTimingRegion(stream); + } + + return; +} + void nbnxn_wait_for_gpu_force_reduction(const AtomLocality gmx_unused atomLocality, gmx_nbnxn_gpu_t *nb) { @@ -1024,4 +1069,19 @@ void nbnxn_wait_for_gpu_force_reduction(const AtomLocality gmx_unused atomL } +void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t *gpu_nbv) +{ + return static_cast (gpu_nbv->xrvec); +} + +void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb) +{ + return static_cast (nb->xAvailableOnDevice); +} + +void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_cuda_t *nb) +{ + nb->xNonLocalCopyD2HDone->waitForEvent(); +} + } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 551f5c2226..bdeefcbb66 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -51,6 +51,7 @@ // TODO Remove this comment when the above order issue is resolved #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/pmalloc_cuda.h" #include "gromacs/hardware/gpu_hw_info.h" #include "gromacs/math/vectypes.h" @@ -481,6 +482,9 @@ gpu_init(const gmx_device_info_t *deviceInfo, stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming); CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed"); + nb->xAvailableOnDevice = new GpuEventSynchronizer(); + nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer(); + /* WARNING: CUDA timings are incorrect with multiple streams. * This is the main reason why they are disabled by default. */ @@ -899,6 +903,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet, const int atomIndicesSize = gridSet.atomIndices().size(); const int *cxy_na = grid.cxy_na().data(); const int *cxy_ind = grid.cxy_ind().data(); + // TODO Should be done once per gridset const int numRealAtomsTotal = gridSet.numRealAtomsTotal(); reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index c18cd3f9a2..ec4b2b8d33 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -206,6 +206,8 @@ using cu_plist_t = Nbnxm::gpu_plist; */ typedef struct Nbnxm::gpu_timers_t cu_timers_t; +class GpuEventSynchronizer; + /** \internal * \brief Main data structure for CUDA nonbonded force calculations. */ @@ -278,6 +280,16 @@ struct gmx_nbnxn_cuda_t gmx::EnumerationArray haveWork; + GpuEventSynchronizer *xAvailableOnDevice; /**< event triggered when + coordinate buffer has been + copied to device by PP task and + any dependent task (e.g. transfer of coordinates + to the PME rank's GPU) can proceed. */ + + GpuEventSynchronizer *xNonLocalCopyD2HDone; /**< event triggered when + non-local coordinate buffer has been + copied from device to host*/ + /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple * concurrent streams, so we won't time if both l/nl work is done on GPUs. * Timer init/uninit is still done even with timing off so only the condition diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 68eac205a4..031033dea9 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -302,9 +302,32 @@ void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocali f); } +void nonbonded_verlet_t::launch_copy_x_from_gpu(rvec *x, const Nbnxm::AtomLocality locality) +{ + nbnxn_launch_copy_x_from_gpu(locality, + pairSearch_->gridSet(), + gpu_nbv, + x); +} + void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality) { nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv); } +void* nonbonded_verlet_t::get_gpu_xrvec() +{ + return Nbnxm::nbnxn_get_gpu_xrvec(gpu_nbv); +} + +void* nonbonded_verlet_t::get_x_on_device_event() +{ + return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv); +} + +void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done() +{ + Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv); +} + /*! \endcond */ diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index db550859df..7a27a5cb4e 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -388,12 +388,24 @@ struct nonbonded_verlet_t /*! \brief D2H transfer of force buffer*/ void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality); + /*! \brief D2H transfer of coordinate buffer*/ + void launch_copy_x_from_gpu(rvec *f, Nbnxm::AtomLocality locality); + /*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete * * FIXME: need more details: when should be called / after which operation, etc. */ void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality); + /*! \brief return GPU pointer to x in rvec format */ + void* get_gpu_xrvec(); + + /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */ + void* get_x_on_device_event(); + + /*! \brief Wait for non-local copy of coordinate buffer from device to host */ + void wait_nonlocal_x_copy_D2H_done(); + //! Return the kernel setup const Nbnxm::KernelSetup &kernelSetup() const { diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index 3794c98320..743bcffa20 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -370,12 +370,46 @@ void nbnxn_launch_copy_f_from_gpu(AtomLocality gmx_unused atomLocali gmx_nbnxn_gpu_t gmx_unused *nb, rvec gmx_unused *f) CUDA_FUNC_TERM; +/*! \brief Asynchronous launch of copying coordinate buffer from GPU to CPU + * \param[in] atomLocality Locality for data trasnfer + * \param[in] gridSet The Grid Set data object + * \param[in] nb The nonbonded data GPU structure + * \param[out] x Coordinate buffer on CPU + */ +CUDA_FUNC_QUALIFIER +void nbnxn_launch_copy_x_from_gpu(AtomLocality gmx_unused atomLocality, + const Nbnxm::GridSet gmx_unused &gridSet, + gmx_nbnxn_gpu_t gmx_unused *nb, + rvec gmx_unused *x) CUDA_FUNC_TERM; + /*! \brief Wait for GPU stream to complete */ CUDA_FUNC_QUALIFIER void nbnxn_wait_for_gpu_force_reduction(AtomLocality gmx_unused atomLocality, gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM; +/*! \brief sync CPU thread on coordinate copy to device + * \param[in] nb The nonbonded data GPU structure + */ +CUDA_FUNC_QUALIFIER +void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM; -} // namespace Nbnxm +/*! \brief return pointer to event recorded when coordinates have been copied to device + * \param[in] nb The nonbonded data GPU structure + */ +CUDA_FUNC_QUALIFIER +void* nbnxn_get_x_on_device_event(const gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); + +/*! \brief return GPU pointer to x in rvec format + * \param[in] nb The nonbonded data GPU structure + */ +CUDA_FUNC_QUALIFIER +void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); + +/*! \brief Wait for non-local copy of coordinate buffer from device to host + * \param[in] nb The nonbonded data GPU structure + */ +CUDA_FUNC_QUALIFIER +void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM; +} // namespace Nbnxm #endif