From: Alan Gray <alang@nvidia.com>
Date: Tue, 26 Feb 2019 12:43:22 +0000 (-0800)
Subject: GPU halo exchange
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=44f607d71afaea9e51bb9d1cc2bc132f34482f32;p=alexxy%2Fgromacs.git

GPU halo exchange

Activate with GMX_GPU_DD_COMMS and GMX_USE_GPU_BUFFER_OPS environment
variable.

Class to initialize and apply coordinate buffer halo exchange
functionality directly on GPU memory space.

Currently only supports direct cuda memcpy, and relies on thread MPI
being in use.

Updated gpucomm testing matrices to cover non-GPU case.

Limitation: still only supports thread MPI, 1D data decomposition and
only coordinate halo exchange

Implements part of #2890
Associated with #2915

Change-Id: I8e6473481ad4d943df78d7019681bfa821bd5798
---

diff --git a/admin/builds/gpucomm-matrix.txt b/admin/builds/gpucomm-matrix.txt
index a189c38786..ef7895fe30 100644
--- a/admin/builds/gpucomm-matrix.txt
+++ b/admin/builds/gpucomm-matrix.txt
@@ -13,14 +13,16 @@
 
 # Test an older version of CUDA
 # Test MPI with CUDA
-# Test MPMD PME with thread-MPI (library MPI not supported initially)
+# Test multiple PP ranks with separate PME rank
+# Test with thread-MPI (library MPI not supported initially)
 # Test GPU comm features in the above combination
-gcc-5 gpuhw=nvidia cuda-9.0 npme=1 nranks=2 thread-mpi openmp gpucomm
+gcc-5 gpuhw=nvidia cuda-9.0 npme=1 nranks=3 thread-mpi openmp gpucomm
 
 # Test newest gcc supported by newest CUDA at time of release
+# Test multiple PP ranks without separate PME rank
 # Test thread-MPI with CUDA
 # Test GPU comm features in the above combination
-gcc-8 gpuhw=nvidia nranks=1 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpucomm
+gcc-8 gpuhw=nvidia nranks=2 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpucomm
 
 # Test non-default use of mdrun -gpu_id
 # Test GPU-sharing among 4 PP ranks
@@ -36,3 +38,6 @@ gcc-7 gpuhw=none cuda-10.0 openmp no-tng release-with-assert gpucomm
 # Test OpenCL build with gpudev features
 # Test GPU comm on the OpenCL path where it is unsupported
 clang-8 openmp gpuhw=amd opencl-1.2 clFFT-2.14 simd=None gpucomm
+
+# Test a non-GPU build with the gpucomm optiom set
+gcc-8 double x11 no-tng fftpack simd=sse4.1 gpucomm
diff --git a/src/gromacs/domdec/CMakeLists.txt b/src/gromacs/domdec/CMakeLists.txt
index 1a17c41300..c4fca1e308 100644
--- a/src/gromacs/domdec/CMakeLists.txt
+++ b/src/gromacs/domdec/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2014,2018, by the GROMACS development team, led by
+# Copyright (c) 2014,2018,2019, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -33,7 +33,12 @@
 # the research papers on the package. Check out http://www.gromacs.org.
 
 file(GLOB DOMDEC_SOURCES *.cpp)
-set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${DOMDEC_SOURCES} PARENT_SCOPE)
+
+if(GMX_USE_CUDA)
+  file(GLOB DOMDEC_CUDA_SOURCES gpuhaloexchange_impl.cu)
+endif()
+
+set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${DOMDEC_SOURCES} ${DOMDEC_CUDA_SOURCES} PARENT_SCOPE)
 
 if (BUILD_TESTING)
     add_subdirectory(tests)
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index 5c76700e8a..f02b37a39c 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -55,6 +55,7 @@
 #include "gromacs/domdec/dlbtiming.h"
 #include "gromacs/domdec/domdec_network.h"
 #include "gromacs/domdec/ga2la.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
 #include "gromacs/domdec/options.h"
 #include "gromacs/domdec/partition.h"
 #include "gromacs/gmxlib/network.h"
@@ -3016,7 +3017,13 @@ getDDSettings(const gmx::MDLogger     &mdlog,
 
     ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
     ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
-    ddSettings.request1DAnd1Pulse  = bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0));
+    // TODO GPU halo exchange requires a 1D single-pulse DD, and when
+    // it is properly integrated the hack with GMX_GPU_DD_COMMS should
+    // be removed.
+    ddSettings.request1DAnd1Pulse  = (bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0)) ||
+                                      (bool(getenv("GMX_GPU_DD_COMMS") != nullptr &&
+                                            GMX_THREAD_MPI &&
+                                            (GMX_GPU == GMX_GPU_CUDA))));
     ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
     ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
     ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
diff --git a/src/gromacs/domdec/domdec_struct.h b/src/gromacs/domdec/domdec_struct.h
index 88bc93be50..60bd04913d 100644
--- a/src/gromacs/domdec/domdec_struct.h
+++ b/src/gromacs/domdec/domdec_struct.h
@@ -74,6 +74,7 @@ namespace gmx
 {
 template <typename T> class HashedMap;
 class LocalAtomSetManager;
+class GpuHaloExchange;
 }
 
 typedef struct {
@@ -226,6 +227,9 @@ struct gmx_domdec_t { //NOLINT(clang-analyzer-optin.performance.Padding)
 
     /* gmx_pme_recv_f buffer */
     std::vector<gmx::RVec> pmeForceReceiveBuffer;
+
+    /* GPU halo exchange object */
+    std::unique_ptr<gmx::GpuHaloExchange> gpuHaloExchange;
 };
 
 //! Are we the master node for domain decomposition
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h
new file mode 100644
index 0000000000..58f908b76a
--- /dev/null
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -0,0 +1,108 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Declaration of GPU halo exchange.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ * \inlibraryapi
+ * \ingroup module_domdec
+ */
+#ifndef GMX_DOMDEC_GPUHALOEXCHANGE_H
+#define GMX_DOMDEC_GPUHALOEXCHANGE_H
+
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/classhelpers.h"
+#include "gromacs/utility/gmxmpi.h"
+
+struct gmx_domdec_t;
+
+namespace gmx
+{
+
+/*! \libinternal
+ * \brief Manages GPU Halo Exchange object */
+class GpuHaloExchange
+{
+
+    public:
+        /*! \brief Creates GPU Halo Exchange object.
+         *
+         * Halo exchange will be performed in \c streamNonLocal, and
+         * the main communicateHaloCoordinates method must be called
+         * before any subsequent operations that access non-local
+         * parts of the coordinate buffer (such as the non-local
+         * non-bonded kernels). It also must be called after the local
+         * coordinates buffer operations (where the coordinates are
+         * copied to the device and hence the \c
+         * coordinatesOnDeviceEvent is recorded).
+         *
+         * \param [inout] dd                       domdec structure
+         * \param [in]    mpi_comm_mysim           communicator used for simulation
+         * \param [in]    streamNonLocal           non-local NB CUDA stream.
+         * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
+         */
+        GpuHaloExchange(gmx_domdec_t *dd,
+                        MPI_Comm      mpi_comm_mysim,
+                        void         *streamNonLocal,
+                        void         *coordinatesOnDeviceEvent);
+        ~GpuHaloExchange();
+
+        /*! \brief
+         *
+         * Initialization for GPU halo exchange of coordinates buffer
+         * \param [in] d_coordinateBuffer   pointer to coordinates buffer in GPU memory
+         */
+        void reinitHalo(rvec *d_coordinateBuffer);
+
+
+        /*! \brief GPU halo exchange of coordinates buffer.
+         *
+         * Must be called after local setCoordinates (which records an
+         * event when the coordinate data has been copied to the
+         * device).
+         * \param [in] box  Coordinate box (from which shifts will be constructed)
+         */
+        void communicateHaloCoordinates(const matrix box);
+
+    private:
+        class Impl;
+        gmx::PrivateImplPointer<Impl> impl_;
+
+};
+
+} //namespace gmx
+
+#endif
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
new file mode 100755
index 0000000000..4b491eac36
--- /dev/null
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -0,0 +1,89 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief May be used to implement Domdec CUDA interfaces for non-GPU builds.
+ *
+ * Currently, reports and exits if any of the interfaces are called.
+ * Needed to satisfy compiler on systems, where CUDA is not available.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_domdec
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gromacs/domdec/gpuhaloexchange.h"
+
+#if GMX_GPU != GMX_GPU_CUDA
+
+namespace gmx
+{
+
+/*!\brief Impl class stub. */
+class GpuHaloExchange::Impl
+{
+};
+
+/*!\brief Constructor stub. */
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
+                                 MPI_Comm       /* mpi_comm_mysim */,
+                                 void         * /*streamNonLocal */,
+                                 void         * /*coordinatesOnDeviceEvent*/)
+    : impl_(nullptr)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+}
+
+GpuHaloExchange::~GpuHaloExchange() = default;
+
+/*!\brief init halo exhange stub. */
+void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+}
+
+/*!\brief apply X halo exchange stub. */
+void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange exchange was called insted of the correct implementation.");
+}
+
+
+}      // namespace gmx
+
+#endif /* GMX_GPU != GMX_GPU_CUDA */
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
new file mode 100644
index 0000000000..4e8ba35dd4
--- /dev/null
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -0,0 +1,344 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements GPU halo exchange using CUDA.
+ *
+ *
+ * \author Alan Gray <alang@nvidia.com.com>
+ *
+ * \ingroup module_domdec
+ */
+#include "gmxpre.h"
+
+#include "gpuhaloexchange_impl.cuh"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
+#include "gromacs/pbcutil/ishift.h"
+
+#include "domdec_internal.h"
+
+namespace gmx
+{
+
+//! Number of CUDA threads in a block
+//TODO Optimize this through experimentation
+constexpr static int c_threadsPerBlock = 256;
+
+template <bool usePBC>
+__global__ void packSendBufKernel(float3 * __restrict__       dataPacked,
+                                  const float3 * __restrict__ data,
+                                  const int * __restrict__    map,
+                                  const int                   mapSize,
+                                  const float3                coordinateShift)
+{
+    int           threadIndex          = blockIdx.x*blockDim.x+threadIdx.x;
+    float3       *gm_dataDest          = &dataPacked[threadIndex];
+    const float3 *gm_dataSrc           = &data[map[threadIndex]];
+
+    if (threadIndex < mapSize)
+    {
+        if (usePBC)
+        {
+            *gm_dataDest = *gm_dataSrc + coordinateShift;
+        }
+        else
+        {
+            *gm_dataDest = *gm_dataSrc;
+        }
+
+    }
+
+    return;
+}
+
+void GpuHaloExchange::Impl::reinitHalo(float3      *d_coordinatesBuffer)
+{
+
+    d_x_ = d_coordinatesBuffer;
+
+    cudaStream_t                  stream            = nonLocalStream_;
+    int                           nzone             = 1;
+    const gmx_domdec_comm_t      &comm              = *dd_->comm;
+    const gmx_domdec_comm_dim_t  &cd                = comm.cd[0];
+    const gmx_domdec_ind_t       &ind               = cd.ind[0];
+    int                           newSize           = ind.nsend[nzone+1];
+
+    GMX_RELEASE_ASSERT(cd.numPulses() == 1, "Multiple pulses are not yet supported in GPU halo exchange");
+    GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange");
+
+    // reallocates only if needed
+    h_indexMap_.resize(newSize);
+    // reallocate on device only if needed
+    if (newSize > maxPackedBufferSize_)
+    {
+        reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, nullptr);
+        reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, nullptr);
+        reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, nullptr);
+        maxPackedBufferSize_ = newSize;
+    }
+
+    xSendSize_ = newSize;
+    MPI_Sendrecv(&xSendSize_, sizeof(int), MPI_BYTE, sendRankX_, 0,
+                 &xRecvSize_, sizeof(int), MPI_BYTE, recvRankX_, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+    fSendSize_ = xRecvSize_;
+    fRecvSize_ = xSendSize_;
+
+    localOffset_ = comm.atomRanges.numHomeAtoms();  //offset for data recieved by this rank
+
+    GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch");
+    std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
+
+    copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, stream, GpuApiCallBehavior::Async, nullptr);
+
+    // This rank will push data to its neighbor, so needs to know
+    // the remote receive address and similarly send its receive
+    // address to other neighbour. We can do this here in reinit fn
+    // since the pointers will not change until the next NS step.
+
+    //Coordinates buffer:
+    void* recvPtr  = static_cast<void*> (&d_coordinatesBuffer[localOffset_]);
+    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0,
+                 &remoteXPtr_, sizeof(void*), MPI_BYTE, sendRankX_, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+    //Force buffer:
+    recvPtr  = static_cast<void*> (d_recvBuf_);
+    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankF_, 0,
+                 &remoteFPtr_, sizeof(void*), MPI_BYTE, sendRankF_, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+
+    return;
+}
+
+// The following method be called after local setCoordinates (which records the coordinatesOnDeviceEvent_
+// event when the coordinate data has been copied to the device).
+void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box)
+{
+
+    //ensure stream waits until coordinate data is available on device
+    coordinatesOnDeviceEvent_->enqueueWaitEvent(nonLocalStream_);
+
+    // launch kernel to pack send buffer
+    KernelLaunchConfig config;
+    config.blockSize[0]     = c_threadsPerBlock;
+    config.blockSize[1]     = 1;
+    config.blockSize[2]     = 1;
+    config.gridSize[0]      = (xSendSize_+c_threadsPerBlock-1)/c_threadsPerBlock;
+    config.gridSize[1]      = 1;
+    config.gridSize[2]      = 1;
+    config.sharedMemorySize = 0;
+    config.stream           = nonLocalStream_;
+
+    const float3     *sendBuf  = d_sendBuf_;
+    const float3     *d_x      = d_x_;
+    const int        *indexMap = d_indexMap_;
+    const int         size     = xSendSize_;
+    // The coordinateShift changes between steps when we have
+    // performed a DD partition, or have updated the box e.g. when
+    // performing pressure coupling. So, for simplicity, the the box
+    // is used every step to pass the shift vector as an argument of
+    // the packing kernel.
+    //
+    // Because only one-dimensional DD is supported, the coordinate
+    // shift only needs to handle that dimension.
+    const int         dimensionIndex = dd_->dim[0];
+    const float3      coordinateShift {
+        box[dimensionIndex][XX], box[dimensionIndex][YY], box[dimensionIndex][ZZ]
+    };
+
+    // Avoid launching kernel when there is no work to do
+    if (size > 0)
+    {
+        auto              kernelFn = usePBC_ ? packSendBufKernel<true> : packSendBufKernel<false>;
+
+        const auto        kernelArgs   = prepareGpuKernelArguments(kernelFn, config, &sendBuf, &d_x, &indexMap,
+                                                                   &size, &coordinateShift);
+
+        launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
+    }
+
+    communicateHaloData(d_x_, HaloQuantity::HaloCoordinates);
+
+    return;
+}
+
+
+void GpuHaloExchange::Impl::communicateHaloData(float3     * d_ptr,
+                                                HaloQuantity haloQuantity)
+{
+
+    void * sendPtr;
+    int    sendSize;
+    void * remotePtr;
+    int    sendRank;
+    int    recvRank;
+    if (haloQuantity == HaloQuantity::HaloCoordinates)
+    {
+        sendPtr   = static_cast<void*> (d_sendBuf_);
+        sendSize  = xSendSize_;
+        remotePtr = remoteXPtr_;
+        sendRank  = sendRankX_;
+        recvRank  = recvRankX_;
+    }
+    else
+    {
+        sendPtr   = static_cast<void*> (&(d_ptr[localOffset_]));
+        sendSize  = fSendSize_;
+        remotePtr = remoteFPtr_;
+        sendRank  = sendRankF_;
+        recvRank  = recvRankF_;
+    }
+
+    communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank);
+}
+
+
+void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
+                                                              int   sendSize,
+                                                              int   sendRank,
+                                                              void *remotePtr,
+                                                              int   recvRank)
+{
+
+    cudaError_t  stat;
+
+    // We asynchronously push data to remote rank. The remote
+    // destination pointer has already been set in the init fn.  We
+    // don't need to worry about overwriting data the remote ranks
+    // still needs since the halo exchange is just done once per
+    // timestep, for each of X and F.
+
+    // send data to neighbor, if any data exists to send
+    if (sendSize > 0)
+    {
+        stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, nonLocalStream_);
+        CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
+    }
+
+    //ensure pushed data has arrived before remote rank progresses
+    // This rank records an event and sends it to the remote rank which has just been pushed data.
+    // This rank recieves event from remote rank which has pushed data here, and enqueues that event to
+    // its stream.
+    GpuEventSynchronizer *haloDataTransferRemote;
+
+    haloDataTransferLaunched_->markEvent(nonLocalStream_);
+
+    MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0,
+                 &haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+    haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
+
+}
+
+/*! \brief Create Domdec GPU object */
+GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
+                            MPI_Comm      mpi_comm_mysim,
+                            void        * nonLocalStream,
+                            void        * coordinatesOnDeviceEvent)
+    : dd_(dd),
+      sendRankX_(dd->neighbor[0][1]),
+      recvRankX_(dd->neighbor[0][0]),
+      sendRankF_(dd->neighbor[0][0]),
+      recvRankF_(dd->neighbor[0][1]),
+      usePBC_(dd->ci[dd->dim[0]] == 0),
+      haloDataTransferLaunched_(new GpuEventSynchronizer()),
+      mpi_comm_mysim_(mpi_comm_mysim),
+      nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream)),
+      coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
+{
+
+    GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "GPU Halo exchange is currently only supported with thread-MPI enabled");
+
+    if (dd->ndim > 1)
+    {
+        gmx_fatal(FARGS, "Error: dd->ndim > 1 is not yet supported in GPU halo exchange");
+    }
+
+    if (usePBC_ && dd->unitCellInfo.haveScrewPBC)
+    {
+        gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n");
+    }
+
+    changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported);
+
+    allocateDeviceBuffer(&d_fShift_, 1, nullptr);
+
+}
+
+GpuHaloExchange::Impl::~Impl()
+{
+    freeDeviceBuffer(&d_indexMap_);
+    freeDeviceBuffer(&d_sendBuf_);
+    freeDeviceBuffer(&d_recvBuf_);
+    freeDeviceBuffer(&d_fShift_);
+    delete haloDataTransferLaunched_;
+}
+
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
+                                 MPI_Comm      mpi_comm_mysim,
+                                 void         *nonLocalStream,
+                                 void         *coordinatesOnDeviceEvent)
+    : impl_(new Impl(dd, mpi_comm_mysim, nonLocalStream, coordinatesOnDeviceEvent))
+{
+}
+
+GpuHaloExchange::~GpuHaloExchange() = default;
+
+void GpuHaloExchange::reinitHalo(rvec        *d_coordinatesBuffer)
+{
+    impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer));
+}
+
+void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
+{
+    impl_->communicateHaloCoordinates(box);
+}
+
+} //namespace gmx
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
new file mode 100644
index 0000000000..bd125654ee
--- /dev/null
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -0,0 +1,180 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declares CUDA implementation of GPU Halo Exchange.
+ *
+ * This header file is needed to include from both the device-side
+ * kernels file, and the host-side management code.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_domdec
+ */
+#ifndef GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H
+#define GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H
+
+#include "gromacs/domdec/gpuhaloexchange.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/hostallocator.h"
+
+namespace gmx
+{
+
+/*! \brief switch for whether coordinates or force halo is being applied */
+enum class HaloQuantity
+{
+    HaloCoordinates, HaloForces
+};
+
+/*! \internal \brief Class with interfaces and data for GPU Halo Exchange */
+class GpuHaloExchange::Impl
+{
+
+    public:
+        /*! \brief Creates GPU Halo Exchange object.
+         *
+         * \param [inout] dd                       domdec structure
+         * \param [in]    mpi_comm_mysim           communicator used for simulation
+         * \param [in]    nonLocalStream           non-local NB CUDA stream
+         * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
+         */
+        Impl(gmx_domdec_t *dd,
+             MPI_Comm mpi_comm_mysim,
+             void *nonLocalStream,
+             void *coordinatesOnDeviceEvent);
+        ~Impl();
+
+        /*! \brief
+         * (Re-) Initialization for GPU halo exchange
+         * \param [in] d_coordinatesBuffer  pointer to coordinates buffer in GPU memory
+         */
+        void reinitHalo(float3 *d_coordinatesBuffer);
+
+
+        /*! \brief
+         * GPU halo exchange of coordinates buffer
+         * \param [in] box  Coordinate box (from which shifts will be constructed)
+         */
+        void communicateHaloCoordinates(const matrix box);
+
+    private:
+
+        /*! \brief Data transfer wrapper for GPU halo exchange
+         * \param [inout] d_ptr      pointer to coordinates or force buffer in GPU memory
+         * \param [in] haloQuantity  switch on whether X or F halo exchange is being performed
+         */
+        void communicateHaloData(float3      *d_ptr,
+                                 HaloQuantity haloQuantity);
+
+        /*! \brief Data transfer for GPU halo exchange using CUDA memcopies
+         * \param [inout] sendPtr    address to send data from
+         * \param [in] sendSize      number of atoms to be sent
+         * \param [in] sendRank      rank to send data to
+         * \param [inout] remotePtr  remote address to recv data
+         * \param [in] recvRank      rank to recv data from
+         */
+        void communicateHaloDataWithCudaDirect(void *sendPtr,
+                                               int   sendSize,
+                                               int   sendRank,
+                                               void* remotePtr,
+                                               int   recvRank);
+
+        //! Domain decomposition object
+        gmx_domdec_t               *dd_                       = nullptr;
+        //! map of indices to be sent from this rank
+        gmx::HostVector<int>        h_indexMap_;
+        //! device copy of index map
+        int                        *d_indexMap_               = nullptr;
+        //! number of elements in index map array
+        int                         indexMapSize_             = -1;
+        //! number of elements allocated in index map array
+        int                         indexMapSizeAlloc_        = -1;
+        //! device buffer for sending packed data
+        float3                     *d_sendBuf_ = nullptr;
+        //! number of atoms in sendbuf array
+        int                         sendBufSize_              = -1;
+        //! number of atoms allocated in sendbuf array
+        int                         sendBufSizeAlloc_         = -1;
+        //! device buffer for receiving packed data
+        float3                     *d_recvBuf_                = nullptr;
+        //! maximum size of packed buffer
+        int                         maxPackedBufferSize_      = 0;
+        //! number of atoms in recvbuf array
+        int                         recvBufSize_              = -1;
+        //! number of atoms allocated in recvbuf array
+        int                         recvBufSizeAlloc_         = -1;
+        //! rank to send data to for X
+        int                         sendRankX_                = 0;
+        //! rank to recv data from for X
+        int                         recvRankX_                = 0;
+        //! rank to send data to for F
+        int                         sendRankF_                = 0;
+        //! rank to recv data from for F
+        int                         recvRankF_                = 0;
+        //! send copy size from this rank for X
+        int                         xSendSize_                = 0;
+        //! recv copy size to this rank for X
+        int                         xRecvSize_                = 0;
+        //! send copy size from this rank for F
+        int                         fSendSize_                = 0;
+        //! recv copy size to this rank for F
+        int                         fRecvSize_                = 0;
+        //! offset of local halo region
+        int                         localOffset_              = 0;
+        //! remote GPU coordinates buffer pointer for pushing data
+        void                       *remoteXPtr_               = 0;
+        //! remote GPU force buffer pointer for pushing data
+        void                       *remoteFPtr_               = 0;
+        //! Periodic Boundary Conditions for this rank
+        bool                        usePBC_                   = false;
+        //! force shift buffer on device
+        float3 *                    d_fShift_                 = nullptr;
+        //! Event triggered when halo transfer has been launched with direct CUD memory copy
+        GpuEventSynchronizer       *haloDataTransferLaunched_ = nullptr;
+        //! MPI communicator used for simulation
+        MPI_Comm                    mpi_comm_mysim_;
+        //! CUDA stream for non-local non-bonded calculations
+        cudaStream_t                nonLocalStream_           = nullptr;
+        //! Event triggered when coordinates have been copied to device
+        GpuEventSynchronizer       *coordinatesOnDeviceEvent_ = nullptr;
+        //! full coordinates buffer in GPU memory
+        float3                     *d_x_                      = nullptr;
+
+};
+
+} // namespace gmx
+
+#endif
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index c540fa86f6..e2fffacd83 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -49,6 +49,7 @@
 #include "gromacs/domdec/dlbtiming.h"
 #include "gromacs/domdec/domdec.h"
 #include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
 #include "gromacs/domdec/partition.h"
 #include "gromacs/essentialdynamics/edsam.h"
 #include "gromacs/ewald/pme.h"
@@ -125,6 +126,10 @@ static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GP
 // TODO eventially tie this in with other existing GPU flags.
 static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
 
+/*! \brief environment variable to enable GPU P2P communication */
+static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
+    && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
+
 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
 {
     const int      end = forceToAdd.size();
@@ -1180,6 +1185,12 @@ void do_force(FILE                                     *fplog,
         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
     }
 
+    const bool            ddUsesGpuDirectCommunication
+        = c_enableGpuHaloExchange && c_enableGpuBufOps && bUseGPU && havePPDomainDecomposition(cr);
+    gmx::GpuHaloExchange *gpuHaloExchange = ddUsesGpuDirectCommunication ? cr->dd->gpuHaloExchange.get() : nullptr;
+    GMX_ASSERT(!ddUsesGpuDirectCommunication || gpuHaloExchange != nullptr,
+               "Must have valid gpuHaloExchange when doing halo exchange on the GPU");
+
     /* Communicate coordinates and sum dipole if necessary +
        do non-local pair search */
     if (havePPDomainDecomposition(cr))
@@ -1196,15 +1207,36 @@ void do_force(FILE                                     *fplog,
             nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::NonLocal);
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             wallcycle_stop(wcycle, ewcNS);
+            if (ddUsesGpuDirectCommunication)
+            {
+                rvec* d_x    = static_cast<rvec *> (nbv->get_gpu_xrvec());
+                gpuHaloExchange->reinitHalo(d_x);
+            }
         }
         else
         {
-            dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
+            if (ddUsesGpuDirectCommunication)
+            {
+                // The following must be called after local setCoordinates (which records an event
+                // when the coordinate data has been copied to the device).
+                gpuHaloExchange->communicateHaloCoordinates(box);
+
+                // TODO Force flags should include haveFreeEnergyWork for this domain
+                if (forceWork.haveCpuBondedWork || (fr->efep != efepNO))
+                {
+                    //non-local part of coordinate buffer must be copied back to host for CPU work
+                    nbv->launch_copy_x_from_gpu(as_rvec_array(x.unpaddedArrayRef().data()), Nbnxm::AtomLocality::NonLocal);
+                }
+            }
+            else
+            {
+                dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
+            }
 
             if (useGpuXBufOps == BufferOpsUseGpu::True)
             {
                 // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
-                if (!useGpuPme)
+                if (!useGpuPme && !ddUsesGpuDirectCommunication)
                 {
                     nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::NonLocal, false,
                                               x.unpaddedArrayRef());
@@ -1403,6 +1435,13 @@ void do_force(FILE                                     *fplog,
         update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
     }
 
+    // TODO Force flags should include haveFreeEnergyWork for this domain
+    if (ddUsesGpuDirectCommunication &&
+        (forceWork.haveCpuBondedWork || (fr->efep != efepNO)))
+    {
+        /* Wait for non-local coordinate data to be copied from device */
+        nbv->wait_nonlocal_x_copy_D2H_done();
+    }
     /* Compute the bonded and non-bonded energies and optionally forces */
     do_force_lowlevel(fr, inputrec, &(top->idef),
                       cr, ms, nrnb, wcycle, mdatoms,
@@ -1419,8 +1458,10 @@ void do_force(FILE                                     *fplog,
                          forceFlags, &forceOut.forceWithVirial(), enerd,
                          ed, forceFlags.doNeighborSearch);
 
-    bool                   useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
-    bool                   haveCpuForces       = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuFPmeReduction);
+    const bool useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+    // TODO Force flags should include haveFreeEnergyWork for this domain
+    const bool haveCpuForces       = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork ||
+                                      useCpuFPmeReduction || (fr->efep != efepNO));
 
     // Will store the amount of cycles spent waiting for the GPU that
     // will be later used in the DLB accounting.
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 5caa3caee5..68348803b2 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -59,6 +59,7 @@
 #include "gromacs/commandline/filenm.h"
 #include "gromacs/domdec/domdec.h"
 #include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
 #include "gromacs/domdec/localatomsetmanager.h"
 #include "gromacs/domdec/partition.h"
 #include "gromacs/ewald/ewald_utils.h"
@@ -161,18 +162,25 @@
 namespace gmx
 {
 
-/*! \brief Log if development feature flags are encountered
+/*! \brief environment variable to enable GPU P2P communication */
+static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
+    && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
+
+/*! \brief Manage any development feature flag variables encountered
  *
- * The use of dev features indicated by environment variables is logged
- * in order to ensure that runs with such featrues enabled can be identified
- * from their log and standard output.
+ * The use of dev features indicated by environment variables is
+ * logged in order to ensure that runs with such featrues enabled can
+ * be identified from their log and standard output. Any cross
+ * dependencies are also checked, and if unsatisified, a fatal error
+ * issued.
  *
  * \param[in]  mdlog        Logger object.
  */
-static void reportDevelopmentFeatures(const gmx::MDLogger &mdlog)
+static void manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
 {
     const bool enableGpuBufOps       = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
     const bool useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
+    const bool enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
 
     if (enableGpuBufOps)
     {
@@ -180,11 +188,22 @@ static void reportDevelopmentFeatures(const gmx::MDLogger &mdlog)
                 "NOTE: This run uses the 'GPU buffer ops' feature, enabled by the GMX_USE_GPU_BUFFER_OPS environment variable.");
     }
 
+    if (enableGpuHaloExchange)
+    {
+        if (!enableGpuBufOps)
+        {
+            gmx_fatal(FARGS, "Cannot enable GPU halo exchange without GPU buffer operations, set GMX_USE_GPU_BUFFER_OPS=1\n");
+        }
+        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+                "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
+    }
+
     if (useGpuUpdateConstrain)
     {
         GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                 "NOTE: This run uses the 'GPU update/constraints' feature, enabled by the GMX_UPDATE_CONSTRAIN_GPU environment variable.");
     }
+
 }
 
 /*! \brief Barrier for safe simultaneous thread access to mdrunner data
@@ -654,7 +673,7 @@ int Mdrunner::mdrunner()
     gmx::MDLogger    mdlog(logOwner.logger());
 
     // report any development features that may be enabled by environment variables
-    reportDevelopmentFeatures(mdlog);
+    manageDevelopmentFeatures(mdlog);
 
     // With thread-MPI, the communicator changes after threads are
     // launched, so this is rebuilt for the master rank at that
@@ -1316,6 +1335,15 @@ int Mdrunner::mdrunner()
                       pforce,
                       wcycle);
 
+        // TODO Move this to happen during domain decomposition setup,
+        // once stream and event handling works well with that.
+        if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded)
+        {
+            void *stream                   = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+            void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
+            cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, stream, coordinatesOnDeviceEvent);
+        }
+
         /* Initialize the mdAtoms structure.
          * mdAtoms is not filled with atom data,
          * as this can not be done now with domain decomposition.
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 5c72c7faad..9715d2985f 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -111,7 +111,6 @@
 #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu"
 #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
 
-
 namespace Nbnxm
 {
 
@@ -777,6 +776,11 @@ void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid               &grid,
     copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, numCopyAtoms,
                        stream, GpuApiCallBehavior::Async, nullptr);
 
+    if (interactionLoc == Nbnxm::InteractionLocality::Local)
+    {
+        nb->xAvailableOnDevice->markEvent(stream);
+    }
+
     if (bDoTime)
     {
         nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
@@ -1011,6 +1015,47 @@ void nbnxn_launch_copy_f_from_gpu(const AtomLocality               atomLocality,
     return;
 }
 
+void nbnxn_launch_copy_x_from_gpu(const AtomLocality               atomLocality,
+                                  const Nbnxm::GridSet            &gridSet,
+                                  gmx_nbnxn_gpu_t                 *nb,
+                                  rvec                            *x)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+    GMX_ASSERT(x,  "Need a valid x pointer");
+
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+    cudaStream_t              stream    = nb->stream[iLocality];
+
+    bool                      bDoTime = nb->bDoTime;
+    cu_timers_t              *t       = nb->timers;
+    int                       atomStart, nAtoms;
+
+    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
+    }
+
+    GMX_ASSERT(nb->xrvec,  "Need a valid nb->xrvec pointer");
+    rvec       *ptrDest = reinterpret_cast<rvec *> (x[atomStart]);
+    rvec       *ptrSrc  = reinterpret_cast<rvec *> (nb->xrvec[atomStart]);
+    copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
+                         stream, GpuApiCallBehavior::Async, stream);
+
+    if (atomLocality == AtomLocality::NonLocal)
+    {
+        nb->xNonLocalCopyD2HDone->markEvent(stream);
+    }
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
+    }
+
+    return;
+}
+
 void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomLocality,
                                         gmx_nbnxn_gpu_t                   *nb)
 {
@@ -1024,4 +1069,19 @@ void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomL
 
 }
 
+void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t *gpu_nbv)
+{
+    return static_cast<void *> (gpu_nbv->xrvec);
+}
+
+void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t   *nb)
+{
+    return static_cast<void*> (nb->xAvailableOnDevice);
+}
+
+void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_cuda_t   *nb)
+{
+    nb->xNonLocalCopyD2HDone->waitForEvent();
+}
+
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 551f5c2226..bdeefcbb66 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -51,6 +51,7 @@
 // TODO Remove this comment when the above order issue is resolved
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/pmalloc_cuda.h"
 #include "gromacs/hardware/gpu_hw_info.h"
 #include "gromacs/math/vectypes.h"
@@ -481,6 +482,9 @@ gpu_init(const gmx_device_info_t   *deviceInfo,
     stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
     CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
 
+    nb->xAvailableOnDevice   = new GpuEventSynchronizer();
+    nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer();
+
     /* WARNING: CUDA timings are incorrect with multiple streams.
      *          This is the main reason why they are disabled by default.
      */
@@ -899,6 +903,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
         const int           atomIndicesSize   = gridSet.atomIndices().size();
         const int          *cxy_na            = grid.cxy_na().data();
         const int          *cxy_ind           = grid.cxy_ind().data();
+        // TODO Should be done once per gridset
         const int           numRealAtomsTotal = gridSet.numRealAtomsTotal();
 
         reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index c18cd3f9a2..ec4b2b8d33 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -206,6 +206,8 @@ using cu_plist_t = Nbnxm::gpu_plist;
  */
 typedef struct Nbnxm::gpu_timers_t cu_timers_t;
 
+class GpuEventSynchronizer;
+
 /** \internal
  * \brief Main data structure for CUDA nonbonded force calculations.
  */
@@ -278,6 +280,16 @@ struct gmx_nbnxn_cuda_t
     gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
 
 
+    GpuEventSynchronizer *xAvailableOnDevice;   /**< event triggered when
+                                                   coordinate buffer has been
+                                                   copied to device by PP task and
+                                                   any dependent task (e.g. transfer of coordinates
+                                                   to the PME rank's GPU) can proceed. */
+
+    GpuEventSynchronizer *xNonLocalCopyD2HDone; /**< event triggered when
+                                                   non-local coordinate buffer has been
+                                                   copied from device to host*/
+
     /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
      * concurrent streams, so we won't time if both l/nl work is done on GPUs.
      * Timer init/uninit is still done even with timing off so only the condition
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 68eac205a4..031033dea9 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -302,9 +302,32 @@ void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocali
                                  f);
 }
 
+void nonbonded_verlet_t::launch_copy_x_from_gpu(rvec *x, const Nbnxm::AtomLocality locality)
+{
+    nbnxn_launch_copy_x_from_gpu(locality,
+                                 pairSearch_->gridSet(),
+                                 gpu_nbv,
+                                 x);
+}
+
 void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality)
 {
     nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv);
 }
 
+void* nonbonded_verlet_t::get_gpu_xrvec()
+{
+    return Nbnxm::nbnxn_get_gpu_xrvec(gpu_nbv);
+}
+
+void* nonbonded_verlet_t::get_x_on_device_event()
+{
+    return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv);
+}
+
+void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done()
+{
+    Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
+}
+
 /*! \endcond */
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index db550859df..7a27a5cb4e 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -388,12 +388,24 @@ struct nonbonded_verlet_t
         /*! \brief D2H transfer of force buffer*/
         void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
 
+        /*! \brief D2H transfer of coordinate buffer*/
+        void launch_copy_x_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
+
         /*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete
          *
          * FIXME: need more details: when should be called / after which operation, etc.
          */
         void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality);
 
+        /*! \brief return GPU pointer to x in rvec format */
+        void* get_gpu_xrvec();
+
+        /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
+        void* get_x_on_device_event();
+
+        /*! \brief Wait for non-local copy of coordinate buffer from device to host */
+        void wait_nonlocal_x_copy_D2H_done();
+
         //! Return the kernel setup
         const Nbnxm::KernelSetup &kernelSetup() const
         {
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 3794c98320..743bcffa20 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -370,12 +370,46 @@ void nbnxn_launch_copy_f_from_gpu(AtomLocality            gmx_unused  atomLocali
                                   gmx_nbnxn_gpu_t         gmx_unused *nb,
                                   rvec                    gmx_unused *f) CUDA_FUNC_TERM;
 
+/*! \brief Asynchronous launch of copying coordinate buffer from GPU to CPU
+ * \param[in]  atomLocality  Locality for data trasnfer
+ * \param[in]  gridSet       The Grid Set data object
+ * \param[in]  nb            The nonbonded data GPU structure
+ * \param[out] x             Coordinate buffer on CPU
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_launch_copy_x_from_gpu(AtomLocality            gmx_unused  atomLocality,
+                                  const Nbnxm::GridSet    gmx_unused &gridSet,
+                                  gmx_nbnxn_gpu_t         gmx_unused *nb,
+                                  rvec                    gmx_unused *x) CUDA_FUNC_TERM;
+
 /*! \brief Wait for GPU stream to complete */
 CUDA_FUNC_QUALIFIER
 void nbnxn_wait_for_gpu_force_reduction(AtomLocality            gmx_unused  atomLocality,
                                         gmx_nbnxn_gpu_t         gmx_unused *nb) CUDA_FUNC_TERM;
 
+/*! \brief sync CPU thread on coordinate copy to device
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
 
-}     // namespace Nbnxm
+/*! \brief return pointer to event recorded when coordinates have been copied to device
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_x_on_device_event(const gmx_nbnxn_gpu_t gmx_unused    *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief return GPU pointer to x in rvec format
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Wait for non-local copy of coordinate buffer from device to host
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
 
+} // namespace Nbnxm
 #endif