# Test an older version of CUDA
# Test MPI with CUDA
-# Test MPMD PME with thread-MPI (library MPI not supported initially)
+# Test multiple PP ranks with separate PME rank
+# Test with thread-MPI (library MPI not supported initially)
# Test GPU comm features in the above combination
-gcc-5 gpuhw=nvidia cuda-9.0 npme=1 nranks=2 thread-mpi openmp gpucomm
+gcc-5 gpuhw=nvidia cuda-9.0 npme=1 nranks=3 thread-mpi openmp gpucomm
# Test newest gcc supported by newest CUDA at time of release
+# Test multiple PP ranks without separate PME rank
# Test thread-MPI with CUDA
# Test GPU comm features in the above combination
-gcc-8 gpuhw=nvidia nranks=1 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpucomm
+gcc-8 gpuhw=nvidia nranks=2 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpucomm
# Test non-default use of mdrun -gpu_id
# Test GPU-sharing among 4 PP ranks
# Test OpenCL build with gpudev features
# Test GPU comm on the OpenCL path where it is unsupported
clang-8 openmp gpuhw=amd opencl-1.2 clFFT-2.14 simd=None gpucomm
+
+# Test a non-GPU build with the gpucomm optiom set
+gcc-8 double x11 no-tng fftpack simd=sse4.1 gpucomm
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2014,2018, by the GROMACS development team, led by
+# Copyright (c) 2014,2018,2019, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
# the research papers on the package. Check out http://www.gromacs.org.
file(GLOB DOMDEC_SOURCES *.cpp)
-set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${DOMDEC_SOURCES} PARENT_SCOPE)
+
+if(GMX_USE_CUDA)
+ file(GLOB DOMDEC_CUDA_SOURCES gpuhaloexchange_impl.cu)
+endif()
+
+set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${DOMDEC_SOURCES} ${DOMDEC_CUDA_SOURCES} PARENT_SCOPE)
if (BUILD_TESTING)
add_subdirectory(tests)
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec_network.h"
#include "gromacs/domdec/ga2la.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/domdec/options.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/gmxlib/network.h"
ddSettings.useSendRecv2 = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
ddSettings.dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
- ddSettings.request1DAnd1Pulse = bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0));
+ // TODO GPU halo exchange requires a 1D single-pulse DD, and when
+ // it is properly integrated the hack with GMX_GPU_DD_COMMS should
+ // be removed.
+ ddSettings.request1DAnd1Pulse = (bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0)) ||
+ (bool(getenv("GMX_GPU_DD_COMMS") != nullptr &&
+ GMX_THREAD_MPI &&
+ (GMX_GPU == GMX_GPU_CUDA))));
ddSettings.useDDOrderZYX = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
ddSettings.eFlop = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
{
template <typename T> class HashedMap;
class LocalAtomSetManager;
+class GpuHaloExchange;
}
typedef struct {
/* gmx_pme_recv_f buffer */
std::vector<gmx::RVec> pmeForceReceiveBuffer;
+
+ /* GPU halo exchange object */
+ std::unique_ptr<gmx::GpuHaloExchange> gpuHaloExchange;
};
//! Are we the master node for domain decomposition
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Declaration of GPU halo exchange.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ * \inlibraryapi
+ * \ingroup module_domdec
+ */
+#ifndef GMX_DOMDEC_GPUHALOEXCHANGE_H
+#define GMX_DOMDEC_GPUHALOEXCHANGE_H
+
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/classhelpers.h"
+#include "gromacs/utility/gmxmpi.h"
+
+struct gmx_domdec_t;
+
+namespace gmx
+{
+
+/*! \libinternal
+ * \brief Manages GPU Halo Exchange object */
+class GpuHaloExchange
+{
+
+ public:
+ /*! \brief Creates GPU Halo Exchange object.
+ *
+ * Halo exchange will be performed in \c streamNonLocal, and
+ * the main communicateHaloCoordinates method must be called
+ * before any subsequent operations that access non-local
+ * parts of the coordinate buffer (such as the non-local
+ * non-bonded kernels). It also must be called after the local
+ * coordinates buffer operations (where the coordinates are
+ * copied to the device and hence the \c
+ * coordinatesOnDeviceEvent is recorded).
+ *
+ * \param [inout] dd domdec structure
+ * \param [in] mpi_comm_mysim communicator used for simulation
+ * \param [in] streamNonLocal non-local NB CUDA stream.
+ * \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
+ */
+ GpuHaloExchange(gmx_domdec_t *dd,
+ MPI_Comm mpi_comm_mysim,
+ void *streamNonLocal,
+ void *coordinatesOnDeviceEvent);
+ ~GpuHaloExchange();
+
+ /*! \brief
+ *
+ * Initialization for GPU halo exchange of coordinates buffer
+ * \param [in] d_coordinateBuffer pointer to coordinates buffer in GPU memory
+ */
+ void reinitHalo(rvec *d_coordinateBuffer);
+
+
+ /*! \brief GPU halo exchange of coordinates buffer.
+ *
+ * Must be called after local setCoordinates (which records an
+ * event when the coordinate data has been copied to the
+ * device).
+ * \param [in] box Coordinate box (from which shifts will be constructed)
+ */
+ void communicateHaloCoordinates(const matrix box);
+
+ private:
+ class Impl;
+ gmx::PrivateImplPointer<Impl> impl_;
+
+};
+
+} //namespace gmx
+
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief May be used to implement Domdec CUDA interfaces for non-GPU builds.
+ *
+ * Currently, reports and exits if any of the interfaces are called.
+ * Needed to satisfy compiler on systems, where CUDA is not available.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_domdec
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gromacs/domdec/gpuhaloexchange.h"
+
+#if GMX_GPU != GMX_GPU_CUDA
+
+namespace gmx
+{
+
+/*!\brief Impl class stub. */
+class GpuHaloExchange::Impl
+{
+};
+
+/*!\brief Constructor stub. */
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
+ MPI_Comm /* mpi_comm_mysim */,
+ void * /*streamNonLocal */,
+ void * /*coordinatesOnDeviceEvent*/)
+ : impl_(nullptr)
+{
+ GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+}
+
+GpuHaloExchange::~GpuHaloExchange() = default;
+
+/*!\brief init halo exhange stub. */
+void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */)
+{
+ GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+}
+
+/*!\brief apply X halo exchange stub. */
+void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */)
+{
+ GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange exchange was called insted of the correct implementation.");
+}
+
+
+} // namespace gmx
+
+#endif /* GMX_GPU != GMX_GPU_CUDA */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements GPU halo exchange using CUDA.
+ *
+ *
+ * \author Alan Gray <alang@nvidia.com.com>
+ *
+ * \ingroup module_domdec
+ */
+#include "gmxpre.h"
+
+#include "gpuhaloexchange_impl.cuh"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
+#include "gromacs/pbcutil/ishift.h"
+
+#include "domdec_internal.h"
+
+namespace gmx
+{
+
+//! Number of CUDA threads in a block
+//TODO Optimize this through experimentation
+constexpr static int c_threadsPerBlock = 256;
+
+template <bool usePBC>
+__global__ void packSendBufKernel(float3 * __restrict__ dataPacked,
+ const float3 * __restrict__ data,
+ const int * __restrict__ map,
+ const int mapSize,
+ const float3 coordinateShift)
+{
+ int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
+ float3 *gm_dataDest = &dataPacked[threadIndex];
+ const float3 *gm_dataSrc = &data[map[threadIndex]];
+
+ if (threadIndex < mapSize)
+ {
+ if (usePBC)
+ {
+ *gm_dataDest = *gm_dataSrc + coordinateShift;
+ }
+ else
+ {
+ *gm_dataDest = *gm_dataSrc;
+ }
+
+ }
+
+ return;
+}
+
+void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer)
+{
+
+ d_x_ = d_coordinatesBuffer;
+
+ cudaStream_t stream = nonLocalStream_;
+ int nzone = 1;
+ const gmx_domdec_comm_t &comm = *dd_->comm;
+ const gmx_domdec_comm_dim_t &cd = comm.cd[0];
+ const gmx_domdec_ind_t &ind = cd.ind[0];
+ int newSize = ind.nsend[nzone+1];
+
+ GMX_RELEASE_ASSERT(cd.numPulses() == 1, "Multiple pulses are not yet supported in GPU halo exchange");
+ GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange");
+
+ // reallocates only if needed
+ h_indexMap_.resize(newSize);
+ // reallocate on device only if needed
+ if (newSize > maxPackedBufferSize_)
+ {
+ reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, nullptr);
+ reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, nullptr);
+ reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, nullptr);
+ maxPackedBufferSize_ = newSize;
+ }
+
+ xSendSize_ = newSize;
+ MPI_Sendrecv(&xSendSize_, sizeof(int), MPI_BYTE, sendRankX_, 0,
+ &xRecvSize_, sizeof(int), MPI_BYTE, recvRankX_, 0,
+ mpi_comm_mysim_, MPI_STATUS_IGNORE);
+ fSendSize_ = xRecvSize_;
+ fRecvSize_ = xSendSize_;
+
+ localOffset_ = comm.atomRanges.numHomeAtoms(); //offset for data recieved by this rank
+
+ GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch");
+ std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
+
+ copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, stream, GpuApiCallBehavior::Async, nullptr);
+
+ // This rank will push data to its neighbor, so needs to know
+ // the remote receive address and similarly send its receive
+ // address to other neighbour. We can do this here in reinit fn
+ // since the pointers will not change until the next NS step.
+
+ //Coordinates buffer:
+ void* recvPtr = static_cast<void*> (&d_coordinatesBuffer[localOffset_]);
+ MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0,
+ &remoteXPtr_, sizeof(void*), MPI_BYTE, sendRankX_, 0,
+ mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+ //Force buffer:
+ recvPtr = static_cast<void*> (d_recvBuf_);
+ MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankF_, 0,
+ &remoteFPtr_, sizeof(void*), MPI_BYTE, sendRankF_, 0,
+ mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+
+ return;
+}
+
+// The following method be called after local setCoordinates (which records the coordinatesOnDeviceEvent_
+// event when the coordinate data has been copied to the device).
+void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box)
+{
+
+ //ensure stream waits until coordinate data is available on device
+ coordinatesOnDeviceEvent_->enqueueWaitEvent(nonLocalStream_);
+
+ // launch kernel to pack send buffer
+ KernelLaunchConfig config;
+ config.blockSize[0] = c_threadsPerBlock;
+ config.blockSize[1] = 1;
+ config.blockSize[2] = 1;
+ config.gridSize[0] = (xSendSize_+c_threadsPerBlock-1)/c_threadsPerBlock;
+ config.gridSize[1] = 1;
+ config.gridSize[2] = 1;
+ config.sharedMemorySize = 0;
+ config.stream = nonLocalStream_;
+
+ const float3 *sendBuf = d_sendBuf_;
+ const float3 *d_x = d_x_;
+ const int *indexMap = d_indexMap_;
+ const int size = xSendSize_;
+ // The coordinateShift changes between steps when we have
+ // performed a DD partition, or have updated the box e.g. when
+ // performing pressure coupling. So, for simplicity, the the box
+ // is used every step to pass the shift vector as an argument of
+ // the packing kernel.
+ //
+ // Because only one-dimensional DD is supported, the coordinate
+ // shift only needs to handle that dimension.
+ const int dimensionIndex = dd_->dim[0];
+ const float3 coordinateShift {
+ box[dimensionIndex][XX], box[dimensionIndex][YY], box[dimensionIndex][ZZ]
+ };
+
+ // Avoid launching kernel when there is no work to do
+ if (size > 0)
+ {
+ auto kernelFn = usePBC_ ? packSendBufKernel<true> : packSendBufKernel<false>;
+
+ const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &sendBuf, &d_x, &indexMap,
+ &size, &coordinateShift);
+
+ launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
+ }
+
+ communicateHaloData(d_x_, HaloQuantity::HaloCoordinates);
+
+ return;
+}
+
+
+void GpuHaloExchange::Impl::communicateHaloData(float3 * d_ptr,
+ HaloQuantity haloQuantity)
+{
+
+ void * sendPtr;
+ int sendSize;
+ void * remotePtr;
+ int sendRank;
+ int recvRank;
+ if (haloQuantity == HaloQuantity::HaloCoordinates)
+ {
+ sendPtr = static_cast<void*> (d_sendBuf_);
+ sendSize = xSendSize_;
+ remotePtr = remoteXPtr_;
+ sendRank = sendRankX_;
+ recvRank = recvRankX_;
+ }
+ else
+ {
+ sendPtr = static_cast<void*> (&(d_ptr[localOffset_]));
+ sendSize = fSendSize_;
+ remotePtr = remoteFPtr_;
+ sendRank = sendRankF_;
+ recvRank = recvRankF_;
+ }
+
+ communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank);
+}
+
+
+void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
+ int sendSize,
+ int sendRank,
+ void *remotePtr,
+ int recvRank)
+{
+
+ cudaError_t stat;
+
+ // We asynchronously push data to remote rank. The remote
+ // destination pointer has already been set in the init fn. We
+ // don't need to worry about overwriting data the remote ranks
+ // still needs since the halo exchange is just done once per
+ // timestep, for each of X and F.
+
+ // send data to neighbor, if any data exists to send
+ if (sendSize > 0)
+ {
+ stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, nonLocalStream_);
+ CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
+ }
+
+ //ensure pushed data has arrived before remote rank progresses
+ // This rank records an event and sends it to the remote rank which has just been pushed data.
+ // This rank recieves event from remote rank which has pushed data here, and enqueues that event to
+ // its stream.
+ GpuEventSynchronizer *haloDataTransferRemote;
+
+ haloDataTransferLaunched_->markEvent(nonLocalStream_);
+
+ MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0,
+ &haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0,
+ mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+ haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
+
+}
+
+/*! \brief Create Domdec GPU object */
+GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
+ MPI_Comm mpi_comm_mysim,
+ void * nonLocalStream,
+ void * coordinatesOnDeviceEvent)
+ : dd_(dd),
+ sendRankX_(dd->neighbor[0][1]),
+ recvRankX_(dd->neighbor[0][0]),
+ sendRankF_(dd->neighbor[0][0]),
+ recvRankF_(dd->neighbor[0][1]),
+ usePBC_(dd->ci[dd->dim[0]] == 0),
+ haloDataTransferLaunched_(new GpuEventSynchronizer()),
+ mpi_comm_mysim_(mpi_comm_mysim),
+ nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream)),
+ coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
+{
+
+ GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "GPU Halo exchange is currently only supported with thread-MPI enabled");
+
+ if (dd->ndim > 1)
+ {
+ gmx_fatal(FARGS, "Error: dd->ndim > 1 is not yet supported in GPU halo exchange");
+ }
+
+ if (usePBC_ && dd->unitCellInfo.haveScrewPBC)
+ {
+ gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n");
+ }
+
+ changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported);
+
+ allocateDeviceBuffer(&d_fShift_, 1, nullptr);
+
+}
+
+GpuHaloExchange::Impl::~Impl()
+{
+ freeDeviceBuffer(&d_indexMap_);
+ freeDeviceBuffer(&d_sendBuf_);
+ freeDeviceBuffer(&d_recvBuf_);
+ freeDeviceBuffer(&d_fShift_);
+ delete haloDataTransferLaunched_;
+}
+
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
+ MPI_Comm mpi_comm_mysim,
+ void *nonLocalStream,
+ void *coordinatesOnDeviceEvent)
+ : impl_(new Impl(dd, mpi_comm_mysim, nonLocalStream, coordinatesOnDeviceEvent))
+{
+}
+
+GpuHaloExchange::~GpuHaloExchange() = default;
+
+void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer)
+{
+ impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer));
+}
+
+void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
+{
+ impl_->communicateHaloCoordinates(box);
+}
+
+} //namespace gmx
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declares CUDA implementation of GPU Halo Exchange.
+ *
+ * This header file is needed to include from both the device-side
+ * kernels file, and the host-side management code.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_domdec
+ */
+#ifndef GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H
+#define GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H
+
+#include "gromacs/domdec/gpuhaloexchange.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/hostallocator.h"
+
+namespace gmx
+{
+
+/*! \brief switch for whether coordinates or force halo is being applied */
+enum class HaloQuantity
+{
+ HaloCoordinates, HaloForces
+};
+
+/*! \internal \brief Class with interfaces and data for GPU Halo Exchange */
+class GpuHaloExchange::Impl
+{
+
+ public:
+ /*! \brief Creates GPU Halo Exchange object.
+ *
+ * \param [inout] dd domdec structure
+ * \param [in] mpi_comm_mysim communicator used for simulation
+ * \param [in] nonLocalStream non-local NB CUDA stream
+ * \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
+ */
+ Impl(gmx_domdec_t *dd,
+ MPI_Comm mpi_comm_mysim,
+ void *nonLocalStream,
+ void *coordinatesOnDeviceEvent);
+ ~Impl();
+
+ /*! \brief
+ * (Re-) Initialization for GPU halo exchange
+ * \param [in] d_coordinatesBuffer pointer to coordinates buffer in GPU memory
+ */
+ void reinitHalo(float3 *d_coordinatesBuffer);
+
+
+ /*! \brief
+ * GPU halo exchange of coordinates buffer
+ * \param [in] box Coordinate box (from which shifts will be constructed)
+ */
+ void communicateHaloCoordinates(const matrix box);
+
+ private:
+
+ /*! \brief Data transfer wrapper for GPU halo exchange
+ * \param [inout] d_ptr pointer to coordinates or force buffer in GPU memory
+ * \param [in] haloQuantity switch on whether X or F halo exchange is being performed
+ */
+ void communicateHaloData(float3 *d_ptr,
+ HaloQuantity haloQuantity);
+
+ /*! \brief Data transfer for GPU halo exchange using CUDA memcopies
+ * \param [inout] sendPtr address to send data from
+ * \param [in] sendSize number of atoms to be sent
+ * \param [in] sendRank rank to send data to
+ * \param [inout] remotePtr remote address to recv data
+ * \param [in] recvRank rank to recv data from
+ */
+ void communicateHaloDataWithCudaDirect(void *sendPtr,
+ int sendSize,
+ int sendRank,
+ void* remotePtr,
+ int recvRank);
+
+ //! Domain decomposition object
+ gmx_domdec_t *dd_ = nullptr;
+ //! map of indices to be sent from this rank
+ gmx::HostVector<int> h_indexMap_;
+ //! device copy of index map
+ int *d_indexMap_ = nullptr;
+ //! number of elements in index map array
+ int indexMapSize_ = -1;
+ //! number of elements allocated in index map array
+ int indexMapSizeAlloc_ = -1;
+ //! device buffer for sending packed data
+ float3 *d_sendBuf_ = nullptr;
+ //! number of atoms in sendbuf array
+ int sendBufSize_ = -1;
+ //! number of atoms allocated in sendbuf array
+ int sendBufSizeAlloc_ = -1;
+ //! device buffer for receiving packed data
+ float3 *d_recvBuf_ = nullptr;
+ //! maximum size of packed buffer
+ int maxPackedBufferSize_ = 0;
+ //! number of atoms in recvbuf array
+ int recvBufSize_ = -1;
+ //! number of atoms allocated in recvbuf array
+ int recvBufSizeAlloc_ = -1;
+ //! rank to send data to for X
+ int sendRankX_ = 0;
+ //! rank to recv data from for X
+ int recvRankX_ = 0;
+ //! rank to send data to for F
+ int sendRankF_ = 0;
+ //! rank to recv data from for F
+ int recvRankF_ = 0;
+ //! send copy size from this rank for X
+ int xSendSize_ = 0;
+ //! recv copy size to this rank for X
+ int xRecvSize_ = 0;
+ //! send copy size from this rank for F
+ int fSendSize_ = 0;
+ //! recv copy size to this rank for F
+ int fRecvSize_ = 0;
+ //! offset of local halo region
+ int localOffset_ = 0;
+ //! remote GPU coordinates buffer pointer for pushing data
+ void *remoteXPtr_ = 0;
+ //! remote GPU force buffer pointer for pushing data
+ void *remoteFPtr_ = 0;
+ //! Periodic Boundary Conditions for this rank
+ bool usePBC_ = false;
+ //! force shift buffer on device
+ float3 * d_fShift_ = nullptr;
+ //! Event triggered when halo transfer has been launched with direct CUD memory copy
+ GpuEventSynchronizer *haloDataTransferLaunched_ = nullptr;
+ //! MPI communicator used for simulation
+ MPI_Comm mpi_comm_mysim_;
+ //! CUDA stream for non-local non-bonded calculations
+ cudaStream_t nonLocalStream_ = nullptr;
+ //! Event triggered when coordinates have been copied to device
+ GpuEventSynchronizer *coordinatesOnDeviceEvent_ = nullptr;
+ //! full coordinates buffer in GPU memory
+ float3 *d_x_ = nullptr;
+
+};
+
+} // namespace gmx
+
+#endif
#include "gromacs/domdec/dlbtiming.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
// TODO eventially tie this in with other existing GPU flags.
static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+/*! \brief environment variable to enable GPU P2P communication */
+static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
+ && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
+
static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
{
const int end = forceToAdd.size();
launchPmeGpuFftAndGather(fr->pmedata, wcycle);
}
+ const bool ddUsesGpuDirectCommunication
+ = c_enableGpuHaloExchange && c_enableGpuBufOps && bUseGPU && havePPDomainDecomposition(cr);
+ gmx::GpuHaloExchange *gpuHaloExchange = ddUsesGpuDirectCommunication ? cr->dd->gpuHaloExchange.get() : nullptr;
+ GMX_ASSERT(!ddUsesGpuDirectCommunication || gpuHaloExchange != nullptr,
+ "Must have valid gpuHaloExchange when doing halo exchange on the GPU");
+
/* Communicate coordinates and sum dipole if necessary +
do non-local pair search */
if (havePPDomainDecomposition(cr))
nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::NonLocal);
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
wallcycle_stop(wcycle, ewcNS);
+ if (ddUsesGpuDirectCommunication)
+ {
+ rvec* d_x = static_cast<rvec *> (nbv->get_gpu_xrvec());
+ gpuHaloExchange->reinitHalo(d_x);
+ }
}
else
{
- dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
+ if (ddUsesGpuDirectCommunication)
+ {
+ // The following must be called after local setCoordinates (which records an event
+ // when the coordinate data has been copied to the device).
+ gpuHaloExchange->communicateHaloCoordinates(box);
+
+ // TODO Force flags should include haveFreeEnergyWork for this domain
+ if (forceWork.haveCpuBondedWork || (fr->efep != efepNO))
+ {
+ //non-local part of coordinate buffer must be copied back to host for CPU work
+ nbv->launch_copy_x_from_gpu(as_rvec_array(x.unpaddedArrayRef().data()), Nbnxm::AtomLocality::NonLocal);
+ }
+ }
+ else
+ {
+ dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
+ }
if (useGpuXBufOps == BufferOpsUseGpu::True)
{
// The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
- if (!useGpuPme)
+ if (!useGpuPme && !ddUsesGpuDirectCommunication)
{
nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::NonLocal, false,
x.unpaddedArrayRef());
update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
}
+ // TODO Force flags should include haveFreeEnergyWork for this domain
+ if (ddUsesGpuDirectCommunication &&
+ (forceWork.haveCpuBondedWork || (fr->efep != efepNO)))
+ {
+ /* Wait for non-local coordinate data to be copied from device */
+ nbv->wait_nonlocal_x_copy_D2H_done();
+ }
/* Compute the bonded and non-bonded energies and optionally forces */
do_force_lowlevel(fr, inputrec, &(top->idef),
cr, ms, nrnb, wcycle, mdatoms,
forceFlags, &forceOut.forceWithVirial(), enerd,
ed, forceFlags.doNeighborSearch);
- bool useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
- bool haveCpuForces = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuFPmeReduction);
+ const bool useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+ // TODO Force flags should include haveFreeEnergyWork for this domain
+ const bool haveCpuForces = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork ||
+ useCpuFPmeReduction || (fr->efep != efepNO));
// Will store the amount of cycles spent waiting for the GPU that
// will be later used in the DLB accounting.
#include "gromacs/commandline/filenm.h"
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/domdec/localatomsetmanager.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/ewald/ewald_utils.h"
namespace gmx
{
-/*! \brief Log if development feature flags are encountered
+/*! \brief environment variable to enable GPU P2P communication */
+static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
+ && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
+
+/*! \brief Manage any development feature flag variables encountered
*
- * The use of dev features indicated by environment variables is logged
- * in order to ensure that runs with such featrues enabled can be identified
- * from their log and standard output.
+ * The use of dev features indicated by environment variables is
+ * logged in order to ensure that runs with such featrues enabled can
+ * be identified from their log and standard output. Any cross
+ * dependencies are also checked, and if unsatisified, a fatal error
+ * issued.
*
* \param[in] mdlog Logger object.
*/
-static void reportDevelopmentFeatures(const gmx::MDLogger &mdlog)
+static void manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
{
const bool enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
const bool useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
+ const bool enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
if (enableGpuBufOps)
{
"NOTE: This run uses the 'GPU buffer ops' feature, enabled by the GMX_USE_GPU_BUFFER_OPS environment variable.");
}
+ if (enableGpuHaloExchange)
+ {
+ if (!enableGpuBufOps)
+ {
+ gmx_fatal(FARGS, "Cannot enable GPU halo exchange without GPU buffer operations, set GMX_USE_GPU_BUFFER_OPS=1\n");
+ }
+ GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+ "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
+ }
+
if (useGpuUpdateConstrain)
{
GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
"NOTE: This run uses the 'GPU update/constraints' feature, enabled by the GMX_UPDATE_CONSTRAIN_GPU environment variable.");
}
+
}
/*! \brief Barrier for safe simultaneous thread access to mdrunner data
gmx::MDLogger mdlog(logOwner.logger());
// report any development features that may be enabled by environment variables
- reportDevelopmentFeatures(mdlog);
+ manageDevelopmentFeatures(mdlog);
// With thread-MPI, the communicator changes after threads are
// launched, so this is rebuilt for the master rank at that
pforce,
wcycle);
+ // TODO Move this to happen during domain decomposition setup,
+ // once stream and event handling works well with that.
+ if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded)
+ {
+ void *stream = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+ void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
+ cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, stream, coordinatesOnDeviceEvent);
+ }
+
/* Initialize the mdAtoms structure.
* mdAtoms is not filled with atom data,
* as this can not be done now with domain decomposition.
#include "gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu"
#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
-
namespace Nbnxm
{
copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, numCopyAtoms,
stream, GpuApiCallBehavior::Async, nullptr);
+ if (interactionLoc == Nbnxm::InteractionLocality::Local)
+ {
+ nb->xAvailableOnDevice->markEvent(stream);
+ }
+
if (bDoTime)
{
nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
return;
}
+void nbnxn_launch_copy_x_from_gpu(const AtomLocality atomLocality,
+ const Nbnxm::GridSet &gridSet,
+ gmx_nbnxn_gpu_t *nb,
+ rvec *x)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+ GMX_ASSERT(x, "Need a valid x pointer");
+
+ const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+ cudaStream_t stream = nb->stream[iLocality];
+
+ bool bDoTime = nb->bDoTime;
+ cu_timers_t *t = nb->timers;
+ int atomStart, nAtoms;
+
+ nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+
+ if (bDoTime)
+ {
+ t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
+ }
+
+ GMX_ASSERT(nb->xrvec, "Need a valid nb->xrvec pointer");
+ rvec *ptrDest = reinterpret_cast<rvec *> (x[atomStart]);
+ rvec *ptrSrc = reinterpret_cast<rvec *> (nb->xrvec[atomStart]);
+ copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
+ stream, GpuApiCallBehavior::Async, stream);
+
+ if (atomLocality == AtomLocality::NonLocal)
+ {
+ nb->xNonLocalCopyD2HDone->markEvent(stream);
+ }
+
+ if (bDoTime)
+ {
+ t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
+ }
+
+ return;
+}
+
void nbnxn_wait_for_gpu_force_reduction(const AtomLocality gmx_unused atomLocality,
gmx_nbnxn_gpu_t *nb)
{
}
+void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t *gpu_nbv)
+{
+ return static_cast<void *> (gpu_nbv->xrvec);
+}
+
+void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb)
+{
+ return static_cast<void*> (nb->xAvailableOnDevice);
+}
+
+void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_cuda_t *nb)
+{
+ nb->xNonLocalCopyD2HDone->waitForEvent();
+}
+
} // namespace Nbnxm
// TODO Remove this comment when the above order issue is resolved
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#include "gromacs/hardware/gpu_hw_info.h"
#include "gromacs/math/vectypes.h"
stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
+ nb->xAvailableOnDevice = new GpuEventSynchronizer();
+ nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer();
+
/* WARNING: CUDA timings are incorrect with multiple streams.
* This is the main reason why they are disabled by default.
*/
const int atomIndicesSize = gridSet.atomIndices().size();
const int *cxy_na = grid.cxy_na().data();
const int *cxy_ind = grid.cxy_ind().data();
+ // TODO Should be done once per gridset
const int numRealAtomsTotal = gridSet.numRealAtomsTotal();
reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
*/
typedef struct Nbnxm::gpu_timers_t cu_timers_t;
+class GpuEventSynchronizer;
+
/** \internal
* \brief Main data structure for CUDA nonbonded force calculations.
*/
gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
+ GpuEventSynchronizer *xAvailableOnDevice; /**< event triggered when
+ coordinate buffer has been
+ copied to device by PP task and
+ any dependent task (e.g. transfer of coordinates
+ to the PME rank's GPU) can proceed. */
+
+ GpuEventSynchronizer *xNonLocalCopyD2HDone; /**< event triggered when
+ non-local coordinate buffer has been
+ copied from device to host*/
+
/* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
* concurrent streams, so we won't time if both l/nl work is done on GPUs.
* Timer init/uninit is still done even with timing off so only the condition
f);
}
+void nonbonded_verlet_t::launch_copy_x_from_gpu(rvec *x, const Nbnxm::AtomLocality locality)
+{
+ nbnxn_launch_copy_x_from_gpu(locality,
+ pairSearch_->gridSet(),
+ gpu_nbv,
+ x);
+}
+
void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality)
{
nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv);
}
+void* nonbonded_verlet_t::get_gpu_xrvec()
+{
+ return Nbnxm::nbnxn_get_gpu_xrvec(gpu_nbv);
+}
+
+void* nonbonded_verlet_t::get_x_on_device_event()
+{
+ return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv);
+}
+
+void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done()
+{
+ Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
+}
+
/*! \endcond */
/*! \brief D2H transfer of force buffer*/
void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
+ /*! \brief D2H transfer of coordinate buffer*/
+ void launch_copy_x_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
+
/*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete
*
* FIXME: need more details: when should be called / after which operation, etc.
*/
void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality);
+ /*! \brief return GPU pointer to x in rvec format */
+ void* get_gpu_xrvec();
+
+ /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
+ void* get_x_on_device_event();
+
+ /*! \brief Wait for non-local copy of coordinate buffer from device to host */
+ void wait_nonlocal_x_copy_D2H_done();
+
//! Return the kernel setup
const Nbnxm::KernelSetup &kernelSetup() const
{
gmx_nbnxn_gpu_t gmx_unused *nb,
rvec gmx_unused *f) CUDA_FUNC_TERM;
+/*! \brief Asynchronous launch of copying coordinate buffer from GPU to CPU
+ * \param[in] atomLocality Locality for data trasnfer
+ * \param[in] gridSet The Grid Set data object
+ * \param[in] nb The nonbonded data GPU structure
+ * \param[out] x Coordinate buffer on CPU
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_launch_copy_x_from_gpu(AtomLocality gmx_unused atomLocality,
+ const Nbnxm::GridSet gmx_unused &gridSet,
+ gmx_nbnxn_gpu_t gmx_unused *nb,
+ rvec gmx_unused *x) CUDA_FUNC_TERM;
+
/*! \brief Wait for GPU stream to complete */
CUDA_FUNC_QUALIFIER
void nbnxn_wait_for_gpu_force_reduction(AtomLocality gmx_unused atomLocality,
gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;
+/*! \brief sync CPU thread on coordinate copy to device
+ * \param[in] nb The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;
-} // namespace Nbnxm
+/*! \brief return pointer to event recorded when coordinates have been copied to device
+ * \param[in] nb The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_x_on_device_event(const gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief return GPU pointer to x in rvec format
+ * \param[in] nb The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Wait for non-local copy of coordinate buffer from device to host
+ * \param[in] nb The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;
+} // namespace Nbnxm
#endif