GPU halo exchange
authorAlan Gray <alang@nvidia.com>
Tue, 26 Feb 2019 12:43:22 +0000 (04:43 -0800)
committerMark Abraham <mark.j.abraham@gmail.com>
Mon, 16 Sep 2019 13:12:49 +0000 (15:12 +0200)
Activate with GMX_GPU_DD_COMMS and GMX_USE_GPU_BUFFER_OPS environment
variable.

Class to initialize and apply coordinate buffer halo exchange
functionality directly on GPU memory space.

Currently only supports direct cuda memcpy, and relies on thread MPI
being in use.

Updated gpucomm testing matrices to cover non-GPU case.

Limitation: still only supports thread MPI, 1D data decomposition and
only coordinate halo exchange

Implements part of #2890
Associated with #2915

Change-Id: I8e6473481ad4d943df78d7019681bfa821bd5798

16 files changed:
admin/builds/gpucomm-matrix.txt
src/gromacs/domdec/CMakeLists.txt
src/gromacs/domdec/domdec.cpp
src/gromacs/domdec/domdec_struct.h
src/gromacs/domdec/gpuhaloexchange.h [new file with mode: 0644]
src/gromacs/domdec/gpuhaloexchange_impl.cpp [new file with mode: 0755]
src/gromacs/domdec/gpuhaloexchange_impl.cu [new file with mode: 0644]
src/gromacs/domdec/gpuhaloexchange_impl.cuh [new file with mode: 0644]
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index a189c38786e6204673e59c3610d016548a3b77c8..ef7895fe30e6fedff9ebcff4367ec8d53943148f 100644 (file)
 
 # Test an older version of CUDA
 # Test MPI with CUDA
-# Test MPMD PME with thread-MPI (library MPI not supported initially)
+# Test multiple PP ranks with separate PME rank
+# Test with thread-MPI (library MPI not supported initially)
 # Test GPU comm features in the above combination
-gcc-5 gpuhw=nvidia cuda-9.0 npme=1 nranks=2 thread-mpi openmp gpucomm
+gcc-5 gpuhw=nvidia cuda-9.0 npme=1 nranks=3 thread-mpi openmp gpucomm
 
 # Test newest gcc supported by newest CUDA at time of release
+# Test multiple PP ranks without separate PME rank
 # Test thread-MPI with CUDA
 # Test GPU comm features in the above combination
-gcc-8 gpuhw=nvidia nranks=1 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpucomm
+gcc-8 gpuhw=nvidia nranks=2 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpucomm
 
 # Test non-default use of mdrun -gpu_id
 # Test GPU-sharing among 4 PP ranks
@@ -36,3 +38,6 @@ gcc-7 gpuhw=none cuda-10.0 openmp no-tng release-with-assert gpucomm
 # Test OpenCL build with gpudev features
 # Test GPU comm on the OpenCL path where it is unsupported
 clang-8 openmp gpuhw=amd opencl-1.2 clFFT-2.14 simd=None gpucomm
+
+# Test a non-GPU build with the gpucomm optiom set
+gcc-8 double x11 no-tng fftpack simd=sse4.1 gpucomm
index 1a17c41300fb5f0cfb1eaf3f5cf08da873a0a568..c4fca1e308a2c540ee356a939067a407ad91d719 100644 (file)
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2014,2018, by the GROMACS development team, led by
+# Copyright (c) 2014,2018,2019, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
 # the research papers on the package. Check out http://www.gromacs.org.
 
 file(GLOB DOMDEC_SOURCES *.cpp)
-set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${DOMDEC_SOURCES} PARENT_SCOPE)
+
+if(GMX_USE_CUDA)
+  file(GLOB DOMDEC_CUDA_SOURCES gpuhaloexchange_impl.cu)
+endif()
+
+set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${DOMDEC_SOURCES} ${DOMDEC_CUDA_SOURCES} PARENT_SCOPE)
 
 if (BUILD_TESTING)
     add_subdirectory(tests)
index 5c76700e8a6dc7c358174b48bd706758ad1d97b6..f02b37a39c652fcfee12a09df8d8493f3105bf9b 100644 (file)
@@ -55,6 +55,7 @@
 #include "gromacs/domdec/dlbtiming.h"
 #include "gromacs/domdec/domdec_network.h"
 #include "gromacs/domdec/ga2la.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
 #include "gromacs/domdec/options.h"
 #include "gromacs/domdec/partition.h"
 #include "gromacs/gmxlib/network.h"
@@ -3016,7 +3017,13 @@ getDDSettings(const gmx::MDLogger     &mdlog,
 
     ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
     ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
-    ddSettings.request1DAnd1Pulse  = bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0));
+    // TODO GPU halo exchange requires a 1D single-pulse DD, and when
+    // it is properly integrated the hack with GMX_GPU_DD_COMMS should
+    // be removed.
+    ddSettings.request1DAnd1Pulse  = (bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0)) ||
+                                      (bool(getenv("GMX_GPU_DD_COMMS") != nullptr &&
+                                            GMX_THREAD_MPI &&
+                                            (GMX_GPU == GMX_GPU_CUDA))));
     ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
     ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
     ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
index 88bc93be50d35f026301a1b0c507c73dff1987c4..60bd04913d39482cbe185ec374db263f475d45f6 100644 (file)
@@ -74,6 +74,7 @@ namespace gmx
 {
 template <typename T> class HashedMap;
 class LocalAtomSetManager;
+class GpuHaloExchange;
 }
 
 typedef struct {
@@ -226,6 +227,9 @@ struct gmx_domdec_t { //NOLINT(clang-analyzer-optin.performance.Padding)
 
     /* gmx_pme_recv_f buffer */
     std::vector<gmx::RVec> pmeForceReceiveBuffer;
+
+    /* GPU halo exchange object */
+    std::unique_ptr<gmx::GpuHaloExchange> gpuHaloExchange;
 };
 
 //! Are we the master node for domain decomposition
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h
new file mode 100644 (file)
index 0000000..58f908b
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Declaration of GPU halo exchange.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ * \inlibraryapi
+ * \ingroup module_domdec
+ */
+#ifndef GMX_DOMDEC_GPUHALOEXCHANGE_H
+#define GMX_DOMDEC_GPUHALOEXCHANGE_H
+
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/classhelpers.h"
+#include "gromacs/utility/gmxmpi.h"
+
+struct gmx_domdec_t;
+
+namespace gmx
+{
+
+/*! \libinternal
+ * \brief Manages GPU Halo Exchange object */
+class GpuHaloExchange
+{
+
+    public:
+        /*! \brief Creates GPU Halo Exchange object.
+         *
+         * Halo exchange will be performed in \c streamNonLocal, and
+         * the main communicateHaloCoordinates method must be called
+         * before any subsequent operations that access non-local
+         * parts of the coordinate buffer (such as the non-local
+         * non-bonded kernels). It also must be called after the local
+         * coordinates buffer operations (where the coordinates are
+         * copied to the device and hence the \c
+         * coordinatesOnDeviceEvent is recorded).
+         *
+         * \param [inout] dd                       domdec structure
+         * \param [in]    mpi_comm_mysim           communicator used for simulation
+         * \param [in]    streamNonLocal           non-local NB CUDA stream.
+         * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
+         */
+        GpuHaloExchange(gmx_domdec_t *dd,
+                        MPI_Comm      mpi_comm_mysim,
+                        void         *streamNonLocal,
+                        void         *coordinatesOnDeviceEvent);
+        ~GpuHaloExchange();
+
+        /*! \brief
+         *
+         * Initialization for GPU halo exchange of coordinates buffer
+         * \param [in] d_coordinateBuffer   pointer to coordinates buffer in GPU memory
+         */
+        void reinitHalo(rvec *d_coordinateBuffer);
+
+
+        /*! \brief GPU halo exchange of coordinates buffer.
+         *
+         * Must be called after local setCoordinates (which records an
+         * event when the coordinate data has been copied to the
+         * device).
+         * \param [in] box  Coordinate box (from which shifts will be constructed)
+         */
+        void communicateHaloCoordinates(const matrix box);
+
+    private:
+        class Impl;
+        gmx::PrivateImplPointer<Impl> impl_;
+
+};
+
+} //namespace gmx
+
+#endif
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
new file mode 100755 (executable)
index 0000000..4b491ea
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief May be used to implement Domdec CUDA interfaces for non-GPU builds.
+ *
+ * Currently, reports and exits if any of the interfaces are called.
+ * Needed to satisfy compiler on systems, where CUDA is not available.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_domdec
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gromacs/domdec/gpuhaloexchange.h"
+
+#if GMX_GPU != GMX_GPU_CUDA
+
+namespace gmx
+{
+
+/*!\brief Impl class stub. */
+class GpuHaloExchange::Impl
+{
+};
+
+/*!\brief Constructor stub. */
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
+                                 MPI_Comm       /* mpi_comm_mysim */,
+                                 void         * /*streamNonLocal */,
+                                 void         * /*coordinatesOnDeviceEvent*/)
+    : impl_(nullptr)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+}
+
+GpuHaloExchange::~GpuHaloExchange() = default;
+
+/*!\brief init halo exhange stub. */
+void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+}
+
+/*!\brief apply X halo exchange stub. */
+void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange exchange was called insted of the correct implementation.");
+}
+
+
+}      // namespace gmx
+
+#endif /* GMX_GPU != GMX_GPU_CUDA */
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
new file mode 100644 (file)
index 0000000..4e8ba35
--- /dev/null
@@ -0,0 +1,344 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements GPU halo exchange using CUDA.
+ *
+ *
+ * \author Alan Gray <alang@nvidia.com.com>
+ *
+ * \ingroup module_domdec
+ */
+#include "gmxpre.h"
+
+#include "gpuhaloexchange_impl.cuh"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
+#include "gromacs/pbcutil/ishift.h"
+
+#include "domdec_internal.h"
+
+namespace gmx
+{
+
+//! Number of CUDA threads in a block
+//TODO Optimize this through experimentation
+constexpr static int c_threadsPerBlock = 256;
+
+template <bool usePBC>
+__global__ void packSendBufKernel(float3 * __restrict__       dataPacked,
+                                  const float3 * __restrict__ data,
+                                  const int * __restrict__    map,
+                                  const int                   mapSize,
+                                  const float3                coordinateShift)
+{
+    int           threadIndex          = blockIdx.x*blockDim.x+threadIdx.x;
+    float3       *gm_dataDest          = &dataPacked[threadIndex];
+    const float3 *gm_dataSrc           = &data[map[threadIndex]];
+
+    if (threadIndex < mapSize)
+    {
+        if (usePBC)
+        {
+            *gm_dataDest = *gm_dataSrc + coordinateShift;
+        }
+        else
+        {
+            *gm_dataDest = *gm_dataSrc;
+        }
+
+    }
+
+    return;
+}
+
+void GpuHaloExchange::Impl::reinitHalo(float3      *d_coordinatesBuffer)
+{
+
+    d_x_ = d_coordinatesBuffer;
+
+    cudaStream_t                  stream            = nonLocalStream_;
+    int                           nzone             = 1;
+    const gmx_domdec_comm_t      &comm              = *dd_->comm;
+    const gmx_domdec_comm_dim_t  &cd                = comm.cd[0];
+    const gmx_domdec_ind_t       &ind               = cd.ind[0];
+    int                           newSize           = ind.nsend[nzone+1];
+
+    GMX_RELEASE_ASSERT(cd.numPulses() == 1, "Multiple pulses are not yet supported in GPU halo exchange");
+    GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange");
+
+    // reallocates only if needed
+    h_indexMap_.resize(newSize);
+    // reallocate on device only if needed
+    if (newSize > maxPackedBufferSize_)
+    {
+        reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, nullptr);
+        reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, nullptr);
+        reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, nullptr);
+        maxPackedBufferSize_ = newSize;
+    }
+
+    xSendSize_ = newSize;
+    MPI_Sendrecv(&xSendSize_, sizeof(int), MPI_BYTE, sendRankX_, 0,
+                 &xRecvSize_, sizeof(int), MPI_BYTE, recvRankX_, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+    fSendSize_ = xRecvSize_;
+    fRecvSize_ = xSendSize_;
+
+    localOffset_ = comm.atomRanges.numHomeAtoms();  //offset for data recieved by this rank
+
+    GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch");
+    std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
+
+    copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, stream, GpuApiCallBehavior::Async, nullptr);
+
+    // This rank will push data to its neighbor, so needs to know
+    // the remote receive address and similarly send its receive
+    // address to other neighbour. We can do this here in reinit fn
+    // since the pointers will not change until the next NS step.
+
+    //Coordinates buffer:
+    void* recvPtr  = static_cast<void*> (&d_coordinatesBuffer[localOffset_]);
+    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0,
+                 &remoteXPtr_, sizeof(void*), MPI_BYTE, sendRankX_, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+    //Force buffer:
+    recvPtr  = static_cast<void*> (d_recvBuf_);
+    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankF_, 0,
+                 &remoteFPtr_, sizeof(void*), MPI_BYTE, sendRankF_, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+
+    return;
+}
+
+// The following method be called after local setCoordinates (which records the coordinatesOnDeviceEvent_
+// event when the coordinate data has been copied to the device).
+void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box)
+{
+
+    //ensure stream waits until coordinate data is available on device
+    coordinatesOnDeviceEvent_->enqueueWaitEvent(nonLocalStream_);
+
+    // launch kernel to pack send buffer
+    KernelLaunchConfig config;
+    config.blockSize[0]     = c_threadsPerBlock;
+    config.blockSize[1]     = 1;
+    config.blockSize[2]     = 1;
+    config.gridSize[0]      = (xSendSize_+c_threadsPerBlock-1)/c_threadsPerBlock;
+    config.gridSize[1]      = 1;
+    config.gridSize[2]      = 1;
+    config.sharedMemorySize = 0;
+    config.stream           = nonLocalStream_;
+
+    const float3     *sendBuf  = d_sendBuf_;
+    const float3     *d_x      = d_x_;
+    const int        *indexMap = d_indexMap_;
+    const int         size     = xSendSize_;
+    // The coordinateShift changes between steps when we have
+    // performed a DD partition, or have updated the box e.g. when
+    // performing pressure coupling. So, for simplicity, the the box
+    // is used every step to pass the shift vector as an argument of
+    // the packing kernel.
+    //
+    // Because only one-dimensional DD is supported, the coordinate
+    // shift only needs to handle that dimension.
+    const int         dimensionIndex = dd_->dim[0];
+    const float3      coordinateShift {
+        box[dimensionIndex][XX], box[dimensionIndex][YY], box[dimensionIndex][ZZ]
+    };
+
+    // Avoid launching kernel when there is no work to do
+    if (size > 0)
+    {
+        auto              kernelFn = usePBC_ ? packSendBufKernel<true> : packSendBufKernel<false>;
+
+        const auto        kernelArgs   = prepareGpuKernelArguments(kernelFn, config, &sendBuf, &d_x, &indexMap,
+                                                                   &size, &coordinateShift);
+
+        launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
+    }
+
+    communicateHaloData(d_x_, HaloQuantity::HaloCoordinates);
+
+    return;
+}
+
+
+void GpuHaloExchange::Impl::communicateHaloData(float3     * d_ptr,
+                                                HaloQuantity haloQuantity)
+{
+
+    void * sendPtr;
+    int    sendSize;
+    void * remotePtr;
+    int    sendRank;
+    int    recvRank;
+    if (haloQuantity == HaloQuantity::HaloCoordinates)
+    {
+        sendPtr   = static_cast<void*> (d_sendBuf_);
+        sendSize  = xSendSize_;
+        remotePtr = remoteXPtr_;
+        sendRank  = sendRankX_;
+        recvRank  = recvRankX_;
+    }
+    else
+    {
+        sendPtr   = static_cast<void*> (&(d_ptr[localOffset_]));
+        sendSize  = fSendSize_;
+        remotePtr = remoteFPtr_;
+        sendRank  = sendRankF_;
+        recvRank  = recvRankF_;
+    }
+
+    communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank);
+}
+
+
+void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
+                                                              int   sendSize,
+                                                              int   sendRank,
+                                                              void *remotePtr,
+                                                              int   recvRank)
+{
+
+    cudaError_t  stat;
+
+    // We asynchronously push data to remote rank. The remote
+    // destination pointer has already been set in the init fn.  We
+    // don't need to worry about overwriting data the remote ranks
+    // still needs since the halo exchange is just done once per
+    // timestep, for each of X and F.
+
+    // send data to neighbor, if any data exists to send
+    if (sendSize > 0)
+    {
+        stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, nonLocalStream_);
+        CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
+    }
+
+    //ensure pushed data has arrived before remote rank progresses
+    // This rank records an event and sends it to the remote rank which has just been pushed data.
+    // This rank recieves event from remote rank which has pushed data here, and enqueues that event to
+    // its stream.
+    GpuEventSynchronizer *haloDataTransferRemote;
+
+    haloDataTransferLaunched_->markEvent(nonLocalStream_);
+
+    MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0,
+                 &haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+    haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
+
+}
+
+/*! \brief Create Domdec GPU object */
+GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
+                            MPI_Comm      mpi_comm_mysim,
+                            void        * nonLocalStream,
+                            void        * coordinatesOnDeviceEvent)
+    : dd_(dd),
+      sendRankX_(dd->neighbor[0][1]),
+      recvRankX_(dd->neighbor[0][0]),
+      sendRankF_(dd->neighbor[0][0]),
+      recvRankF_(dd->neighbor[0][1]),
+      usePBC_(dd->ci[dd->dim[0]] == 0),
+      haloDataTransferLaunched_(new GpuEventSynchronizer()),
+      mpi_comm_mysim_(mpi_comm_mysim),
+      nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream)),
+      coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
+{
+
+    GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "GPU Halo exchange is currently only supported with thread-MPI enabled");
+
+    if (dd->ndim > 1)
+    {
+        gmx_fatal(FARGS, "Error: dd->ndim > 1 is not yet supported in GPU halo exchange");
+    }
+
+    if (usePBC_ && dd->unitCellInfo.haveScrewPBC)
+    {
+        gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n");
+    }
+
+    changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported);
+
+    allocateDeviceBuffer(&d_fShift_, 1, nullptr);
+
+}
+
+GpuHaloExchange::Impl::~Impl()
+{
+    freeDeviceBuffer(&d_indexMap_);
+    freeDeviceBuffer(&d_sendBuf_);
+    freeDeviceBuffer(&d_recvBuf_);
+    freeDeviceBuffer(&d_fShift_);
+    delete haloDataTransferLaunched_;
+}
+
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
+                                 MPI_Comm      mpi_comm_mysim,
+                                 void         *nonLocalStream,
+                                 void         *coordinatesOnDeviceEvent)
+    : impl_(new Impl(dd, mpi_comm_mysim, nonLocalStream, coordinatesOnDeviceEvent))
+{
+}
+
+GpuHaloExchange::~GpuHaloExchange() = default;
+
+void GpuHaloExchange::reinitHalo(rvec        *d_coordinatesBuffer)
+{
+    impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer));
+}
+
+void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
+{
+    impl_->communicateHaloCoordinates(box);
+}
+
+} //namespace gmx
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
new file mode 100644 (file)
index 0000000..bd12565
--- /dev/null
@@ -0,0 +1,180 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declares CUDA implementation of GPU Halo Exchange.
+ *
+ * This header file is needed to include from both the device-side
+ * kernels file, and the host-side management code.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_domdec
+ */
+#ifndef GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H
+#define GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H
+
+#include "gromacs/domdec/gpuhaloexchange.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/hostallocator.h"
+
+namespace gmx
+{
+
+/*! \brief switch for whether coordinates or force halo is being applied */
+enum class HaloQuantity
+{
+    HaloCoordinates, HaloForces
+};
+
+/*! \internal \brief Class with interfaces and data for GPU Halo Exchange */
+class GpuHaloExchange::Impl
+{
+
+    public:
+        /*! \brief Creates GPU Halo Exchange object.
+         *
+         * \param [inout] dd                       domdec structure
+         * \param [in]    mpi_comm_mysim           communicator used for simulation
+         * \param [in]    nonLocalStream           non-local NB CUDA stream
+         * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
+         */
+        Impl(gmx_domdec_t *dd,
+             MPI_Comm mpi_comm_mysim,
+             void *nonLocalStream,
+             void *coordinatesOnDeviceEvent);
+        ~Impl();
+
+        /*! \brief
+         * (Re-) Initialization for GPU halo exchange
+         * \param [in] d_coordinatesBuffer  pointer to coordinates buffer in GPU memory
+         */
+        void reinitHalo(float3 *d_coordinatesBuffer);
+
+
+        /*! \brief
+         * GPU halo exchange of coordinates buffer
+         * \param [in] box  Coordinate box (from which shifts will be constructed)
+         */
+        void communicateHaloCoordinates(const matrix box);
+
+    private:
+
+        /*! \brief Data transfer wrapper for GPU halo exchange
+         * \param [inout] d_ptr      pointer to coordinates or force buffer in GPU memory
+         * \param [in] haloQuantity  switch on whether X or F halo exchange is being performed
+         */
+        void communicateHaloData(float3      *d_ptr,
+                                 HaloQuantity haloQuantity);
+
+        /*! \brief Data transfer for GPU halo exchange using CUDA memcopies
+         * \param [inout] sendPtr    address to send data from
+         * \param [in] sendSize      number of atoms to be sent
+         * \param [in] sendRank      rank to send data to
+         * \param [inout] remotePtr  remote address to recv data
+         * \param [in] recvRank      rank to recv data from
+         */
+        void communicateHaloDataWithCudaDirect(void *sendPtr,
+                                               int   sendSize,
+                                               int   sendRank,
+                                               void* remotePtr,
+                                               int   recvRank);
+
+        //! Domain decomposition object
+        gmx_domdec_t               *dd_                       = nullptr;
+        //! map of indices to be sent from this rank
+        gmx::HostVector<int>        h_indexMap_;
+        //! device copy of index map
+        int                        *d_indexMap_               = nullptr;
+        //! number of elements in index map array
+        int                         indexMapSize_             = -1;
+        //! number of elements allocated in index map array
+        int                         indexMapSizeAlloc_        = -1;
+        //! device buffer for sending packed data
+        float3                     *d_sendBuf_ = nullptr;
+        //! number of atoms in sendbuf array
+        int                         sendBufSize_              = -1;
+        //! number of atoms allocated in sendbuf array
+        int                         sendBufSizeAlloc_         = -1;
+        //! device buffer for receiving packed data
+        float3                     *d_recvBuf_                = nullptr;
+        //! maximum size of packed buffer
+        int                         maxPackedBufferSize_      = 0;
+        //! number of atoms in recvbuf array
+        int                         recvBufSize_              = -1;
+        //! number of atoms allocated in recvbuf array
+        int                         recvBufSizeAlloc_         = -1;
+        //! rank to send data to for X
+        int                         sendRankX_                = 0;
+        //! rank to recv data from for X
+        int                         recvRankX_                = 0;
+        //! rank to send data to for F
+        int                         sendRankF_                = 0;
+        //! rank to recv data from for F
+        int                         recvRankF_                = 0;
+        //! send copy size from this rank for X
+        int                         xSendSize_                = 0;
+        //! recv copy size to this rank for X
+        int                         xRecvSize_                = 0;
+        //! send copy size from this rank for F
+        int                         fSendSize_                = 0;
+        //! recv copy size to this rank for F
+        int                         fRecvSize_                = 0;
+        //! offset of local halo region
+        int                         localOffset_              = 0;
+        //! remote GPU coordinates buffer pointer for pushing data
+        void                       *remoteXPtr_               = 0;
+        //! remote GPU force buffer pointer for pushing data
+        void                       *remoteFPtr_               = 0;
+        //! Periodic Boundary Conditions for this rank
+        bool                        usePBC_                   = false;
+        //! force shift buffer on device
+        float3 *                    d_fShift_                 = nullptr;
+        //! Event triggered when halo transfer has been launched with direct CUD memory copy
+        GpuEventSynchronizer       *haloDataTransferLaunched_ = nullptr;
+        //! MPI communicator used for simulation
+        MPI_Comm                    mpi_comm_mysim_;
+        //! CUDA stream for non-local non-bonded calculations
+        cudaStream_t                nonLocalStream_           = nullptr;
+        //! Event triggered when coordinates have been copied to device
+        GpuEventSynchronizer       *coordinatesOnDeviceEvent_ = nullptr;
+        //! full coordinates buffer in GPU memory
+        float3                     *d_x_                      = nullptr;
+
+};
+
+} // namespace gmx
+
+#endif
index c540fa86f6d2d390ceec96aa0d212649b46daf5b..e2fffacd83c6c0366c9d9e6752417305940f9203 100644 (file)
@@ -49,6 +49,7 @@
 #include "gromacs/domdec/dlbtiming.h"
 #include "gromacs/domdec/domdec.h"
 #include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
 #include "gromacs/domdec/partition.h"
 #include "gromacs/essentialdynamics/edsam.h"
 #include "gromacs/ewald/pme.h"
@@ -125,6 +126,10 @@ static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GP
 // TODO eventially tie this in with other existing GPU flags.
 static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
 
+/*! \brief environment variable to enable GPU P2P communication */
+static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
+    && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
+
 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
 {
     const int      end = forceToAdd.size();
@@ -1180,6 +1185,12 @@ void do_force(FILE                                     *fplog,
         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
     }
 
+    const bool            ddUsesGpuDirectCommunication
+        = c_enableGpuHaloExchange && c_enableGpuBufOps && bUseGPU && havePPDomainDecomposition(cr);
+    gmx::GpuHaloExchange *gpuHaloExchange = ddUsesGpuDirectCommunication ? cr->dd->gpuHaloExchange.get() : nullptr;
+    GMX_ASSERT(!ddUsesGpuDirectCommunication || gpuHaloExchange != nullptr,
+               "Must have valid gpuHaloExchange when doing halo exchange on the GPU");
+
     /* Communicate coordinates and sum dipole if necessary +
        do non-local pair search */
     if (havePPDomainDecomposition(cr))
@@ -1196,15 +1207,36 @@ void do_force(FILE                                     *fplog,
             nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::NonLocal);
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             wallcycle_stop(wcycle, ewcNS);
+            if (ddUsesGpuDirectCommunication)
+            {
+                rvec* d_x    = static_cast<rvec *> (nbv->get_gpu_xrvec());
+                gpuHaloExchange->reinitHalo(d_x);
+            }
         }
         else
         {
-            dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
+            if (ddUsesGpuDirectCommunication)
+            {
+                // The following must be called after local setCoordinates (which records an event
+                // when the coordinate data has been copied to the device).
+                gpuHaloExchange->communicateHaloCoordinates(box);
+
+                // TODO Force flags should include haveFreeEnergyWork for this domain
+                if (forceWork.haveCpuBondedWork || (fr->efep != efepNO))
+                {
+                    //non-local part of coordinate buffer must be copied back to host for CPU work
+                    nbv->launch_copy_x_from_gpu(as_rvec_array(x.unpaddedArrayRef().data()), Nbnxm::AtomLocality::NonLocal);
+                }
+            }
+            else
+            {
+                dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
+            }
 
             if (useGpuXBufOps == BufferOpsUseGpu::True)
             {
                 // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
-                if (!useGpuPme)
+                if (!useGpuPme && !ddUsesGpuDirectCommunication)
                 {
                     nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::NonLocal, false,
                                               x.unpaddedArrayRef());
@@ -1403,6 +1435,13 @@ void do_force(FILE                                     *fplog,
         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
     }
 
+    // TODO Force flags should include haveFreeEnergyWork for this domain
+    if (ddUsesGpuDirectCommunication &&
+        (forceWork.haveCpuBondedWork || (fr->efep != efepNO)))
+    {
+        /* Wait for non-local coordinate data to be copied from device */
+        nbv->wait_nonlocal_x_copy_D2H_done();
+    }
     /* Compute the bonded and non-bonded energies and optionally forces */
     do_force_lowlevel(fr, inputrec, &(top->idef),
                       cr, ms, nrnb, wcycle, mdatoms,
@@ -1419,8 +1458,10 @@ void do_force(FILE                                     *fplog,
                          forceFlags, &forceOut.forceWithVirial(), enerd,
                          ed, forceFlags.doNeighborSearch);
 
-    bool                   useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
-    bool                   haveCpuForces       = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuFPmeReduction);
+    const bool useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+    // TODO Force flags should include haveFreeEnergyWork for this domain
+    const bool haveCpuForces       = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork ||
+                                      useCpuFPmeReduction || (fr->efep != efepNO));
 
     // Will store the amount of cycles spent waiting for the GPU that
     // will be later used in the DLB accounting.
index 5caa3caee57552048aceb65916164c6917b013f2..68348803b2af02f99f7ab944f2b0c197b23363c1 100644 (file)
@@ -59,6 +59,7 @@
 #include "gromacs/commandline/filenm.h"
 #include "gromacs/domdec/domdec.h"
 #include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
 #include "gromacs/domdec/localatomsetmanager.h"
 #include "gromacs/domdec/partition.h"
 #include "gromacs/ewald/ewald_utils.h"
 namespace gmx
 {
 
-/*! \brief Log if development feature flags are encountered
+/*! \brief environment variable to enable GPU P2P communication */
+static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
+    && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
+
+/*! \brief Manage any development feature flag variables encountered
  *
- * The use of dev features indicated by environment variables is logged
- * in order to ensure that runs with such featrues enabled can be identified
- * from their log and standard output.
+ * The use of dev features indicated by environment variables is
+ * logged in order to ensure that runs with such featrues enabled can
+ * be identified from their log and standard output. Any cross
+ * dependencies are also checked, and if unsatisified, a fatal error
+ * issued.
  *
  * \param[in]  mdlog        Logger object.
  */
-static void reportDevelopmentFeatures(const gmx::MDLogger &mdlog)
+static void manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
 {
     const bool enableGpuBufOps       = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
     const bool useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
+    const bool enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
 
     if (enableGpuBufOps)
     {
@@ -180,11 +188,22 @@ static void reportDevelopmentFeatures(const gmx::MDLogger &mdlog)
                 "NOTE: This run uses the 'GPU buffer ops' feature, enabled by the GMX_USE_GPU_BUFFER_OPS environment variable.");
     }
 
+    if (enableGpuHaloExchange)
+    {
+        if (!enableGpuBufOps)
+        {
+            gmx_fatal(FARGS, "Cannot enable GPU halo exchange without GPU buffer operations, set GMX_USE_GPU_BUFFER_OPS=1\n");
+        }
+        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+                "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
+    }
+
     if (useGpuUpdateConstrain)
     {
         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                 "NOTE: This run uses the 'GPU update/constraints' feature, enabled by the GMX_UPDATE_CONSTRAIN_GPU environment variable.");
     }
+
 }
 
 /*! \brief Barrier for safe simultaneous thread access to mdrunner data
@@ -654,7 +673,7 @@ int Mdrunner::mdrunner()
     gmx::MDLogger    mdlog(logOwner.logger());
 
     // report any development features that may be enabled by environment variables
-    reportDevelopmentFeatures(mdlog);
+    manageDevelopmentFeatures(mdlog);
 
     // With thread-MPI, the communicator changes after threads are
     // launched, so this is rebuilt for the master rank at that
@@ -1316,6 +1335,15 @@ int Mdrunner::mdrunner()
                       pforce,
                       wcycle);
 
+        // TODO Move this to happen during domain decomposition setup,
+        // once stream and event handling works well with that.
+        if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded)
+        {
+            void *stream                   = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+            void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
+            cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, stream, coordinatesOnDeviceEvent);
+        }
+
         /* Initialize the mdAtoms structure.
          * mdAtoms is not filled with atom data,
          * as this can not be done now with domain decomposition.
index 5c72c7faadf0bf24a875996bd35fc75b69fcb27e..9715d2985f1055b8cec5ce60f632652552681ce3 100644 (file)
 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu"
 #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
 
-
 namespace Nbnxm
 {
 
@@ -777,6 +776,11 @@ void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid               &grid,
     copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, numCopyAtoms,
                        stream, GpuApiCallBehavior::Async, nullptr);
 
+    if (interactionLoc == Nbnxm::InteractionLocality::Local)
+    {
+        nb->xAvailableOnDevice->markEvent(stream);
+    }
+
     if (bDoTime)
     {
         nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
@@ -1011,6 +1015,47 @@ void nbnxn_launch_copy_f_from_gpu(const AtomLocality               atomLocality,
     return;
 }
 
+void nbnxn_launch_copy_x_from_gpu(const AtomLocality               atomLocality,
+                                  const Nbnxm::GridSet            &gridSet,
+                                  gmx_nbnxn_gpu_t                 *nb,
+                                  rvec                            *x)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+    GMX_ASSERT(x,  "Need a valid x pointer");
+
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+    cudaStream_t              stream    = nb->stream[iLocality];
+
+    bool                      bDoTime = nb->bDoTime;
+    cu_timers_t              *t       = nb->timers;
+    int                       atomStart, nAtoms;
+
+    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
+    }
+
+    GMX_ASSERT(nb->xrvec,  "Need a valid nb->xrvec pointer");
+    rvec       *ptrDest = reinterpret_cast<rvec *> (x[atomStart]);
+    rvec       *ptrSrc  = reinterpret_cast<rvec *> (nb->xrvec[atomStart]);
+    copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
+                         stream, GpuApiCallBehavior::Async, stream);
+
+    if (atomLocality == AtomLocality::NonLocal)
+    {
+        nb->xNonLocalCopyD2HDone->markEvent(stream);
+    }
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
+    }
+
+    return;
+}
+
 void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomLocality,
                                         gmx_nbnxn_gpu_t                   *nb)
 {
@@ -1024,4 +1069,19 @@ void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomL
 
 }
 
+void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t *gpu_nbv)
+{
+    return static_cast<void *> (gpu_nbv->xrvec);
+}
+
+void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t   *nb)
+{
+    return static_cast<void*> (nb->xAvailableOnDevice);
+}
+
+void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_cuda_t   *nb)
+{
+    nb->xNonLocalCopyD2HDone->waitForEvent();
+}
+
 } // namespace Nbnxm
index 551f5c22263f1cc16c5e3c8df349bc010d6504b7..bdeefcbb660965d271ad34ed41f5c3e477e041dd 100644 (file)
@@ -51,6 +51,7 @@
 // TODO Remove this comment when the above order issue is resolved
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/pmalloc_cuda.h"
 #include "gromacs/hardware/gpu_hw_info.h"
 #include "gromacs/math/vectypes.h"
@@ -481,6 +482,9 @@ gpu_init(const gmx_device_info_t   *deviceInfo,
     stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
     CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
 
+    nb->xAvailableOnDevice   = new GpuEventSynchronizer();
+    nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer();
+
     /* WARNING: CUDA timings are incorrect with multiple streams.
      *          This is the main reason why they are disabled by default.
      */
@@ -899,6 +903,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
         const int           atomIndicesSize   = gridSet.atomIndices().size();
         const int          *cxy_na            = grid.cxy_na().data();
         const int          *cxy_ind           = grid.cxy_ind().data();
+        // TODO Should be done once per gridset
         const int           numRealAtomsTotal = gridSet.numRealAtomsTotal();
 
         reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
index c18cd3f9a24bee527b21d3ce5530a8abf9a30b4e..ec4b2b8d339592d85b5bd7e802dca928f001d1bb 100644 (file)
@@ -206,6 +206,8 @@ using cu_plist_t = Nbnxm::gpu_plist;
  */
 typedef struct Nbnxm::gpu_timers_t cu_timers_t;
 
+class GpuEventSynchronizer;
+
 /** \internal
  * \brief Main data structure for CUDA nonbonded force calculations.
  */
@@ -278,6 +280,16 @@ struct gmx_nbnxn_cuda_t
     gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
 
 
+    GpuEventSynchronizer *xAvailableOnDevice;   /**< event triggered when
+                                                   coordinate buffer has been
+                                                   copied to device by PP task and
+                                                   any dependent task (e.g. transfer of coordinates
+                                                   to the PME rank's GPU) can proceed. */
+
+    GpuEventSynchronizer *xNonLocalCopyD2HDone; /**< event triggered when
+                                                   non-local coordinate buffer has been
+                                                   copied from device to host*/
+
     /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
      * concurrent streams, so we won't time if both l/nl work is done on GPUs.
      * Timer init/uninit is still done even with timing off so only the condition
index 68eac205a431f5def1534b8db975dbeb39e98ebb..031033dea93e8e29df1595229e3612b87ae582b0 100644 (file)
@@ -302,9 +302,32 @@ void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocali
                                  f);
 }
 
+void nonbonded_verlet_t::launch_copy_x_from_gpu(rvec *x, const Nbnxm::AtomLocality locality)
+{
+    nbnxn_launch_copy_x_from_gpu(locality,
+                                 pairSearch_->gridSet(),
+                                 gpu_nbv,
+                                 x);
+}
+
 void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality)
 {
     nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv);
 }
 
+void* nonbonded_verlet_t::get_gpu_xrvec()
+{
+    return Nbnxm::nbnxn_get_gpu_xrvec(gpu_nbv);
+}
+
+void* nonbonded_verlet_t::get_x_on_device_event()
+{
+    return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv);
+}
+
+void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done()
+{
+    Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
+}
+
 /*! \endcond */
index db550859dfffa7522a18482a09d1c28e0f1b8b8a..7a27a5cb4e10af0696e90c2d5dc87b30577c89c1 100644 (file)
@@ -388,12 +388,24 @@ struct nonbonded_verlet_t
         /*! \brief D2H transfer of force buffer*/
         void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
 
+        /*! \brief D2H transfer of coordinate buffer*/
+        void launch_copy_x_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
+
         /*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete
          *
          * FIXME: need more details: when should be called / after which operation, etc.
          */
         void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality);
 
+        /*! \brief return GPU pointer to x in rvec format */
+        void* get_gpu_xrvec();
+
+        /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
+        void* get_x_on_device_event();
+
+        /*! \brief Wait for non-local copy of coordinate buffer from device to host */
+        void wait_nonlocal_x_copy_D2H_done();
+
         //! Return the kernel setup
         const Nbnxm::KernelSetup &kernelSetup() const
         {
index 3794c98320b0abb8d1053168b5e39d3167609722..743bcffa2034cd69d35eb2ecf52d6d7c57488f4c 100644 (file)
@@ -370,12 +370,46 @@ void nbnxn_launch_copy_f_from_gpu(AtomLocality            gmx_unused  atomLocali
                                   gmx_nbnxn_gpu_t         gmx_unused *nb,
                                   rvec                    gmx_unused *f) CUDA_FUNC_TERM;
 
+/*! \brief Asynchronous launch of copying coordinate buffer from GPU to CPU
+ * \param[in]  atomLocality  Locality for data trasnfer
+ * \param[in]  gridSet       The Grid Set data object
+ * \param[in]  nb            The nonbonded data GPU structure
+ * \param[out] x             Coordinate buffer on CPU
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_launch_copy_x_from_gpu(AtomLocality            gmx_unused  atomLocality,
+                                  const Nbnxm::GridSet    gmx_unused &gridSet,
+                                  gmx_nbnxn_gpu_t         gmx_unused *nb,
+                                  rvec                    gmx_unused *x) CUDA_FUNC_TERM;
+
 /*! \brief Wait for GPU stream to complete */
 CUDA_FUNC_QUALIFIER
 void nbnxn_wait_for_gpu_force_reduction(AtomLocality            gmx_unused  atomLocality,
                                         gmx_nbnxn_gpu_t         gmx_unused *nb) CUDA_FUNC_TERM;
 
+/*! \brief sync CPU thread on coordinate copy to device
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
 
-}     // namespace Nbnxm
+/*! \brief return pointer to event recorded when coordinates have been copied to device
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_x_on_device_event(const gmx_nbnxn_gpu_t gmx_unused    *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief return GPU pointer to x in rvec format
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Wait for non-local copy of coordinate buffer from device to host
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
 
+} // namespace Nbnxm
 #endif