GPU halo exchange

author Alan Gray <alang@nvidia.com>

Tue, 26 Feb 2019 12:43:22 +0000 (04:43 -0800)

committer Mark Abraham <mark.j.abraham@gmail.com>

Mon, 16 Sep 2019 13:12:49 +0000 (15:12 +0200)
author Alan Gray <alang@nvidia.com>
Tue, 26 Feb 2019 12:43:22 +0000 (04:43 -0800)
committer Mark Abraham <mark.j.abraham@gmail.com>
Mon, 16 Sep 2019 13:12:49 +0000 (15:12 +0200)
diff --git a/admin/builds/gpucomm-matrix.txt b/admin/builds/gpucomm-matrix.txt

index a189c38786e6204673e59c3610d016548a3b77c8..ef7895fe30e6fedff9ebcff4367ec8d53943148f 100644 (file)
--- a/admin/builds/gpucomm-matrix.txt
+++ b/admin/builds/gpucomm-matrix.txt
@@ -13,14 +13,16 @@
  
  # Test an older version of CUDA
  # Test MPI with CUDA
-# Test MPMD PME with thread-MPI (library MPI not supported initially)
+# Test multiple PP ranks with separate PME rank
+# Test with thread-MPI (library MPI not supported initially)
  # Test GPU comm features in the above combination
-gcc-5 gpuhw=nvidia cuda-9.0 npme=1 nranks=2 thread-mpi openmp gpucomm
+gcc-5 gpuhw=nvidia cuda-9.0 npme=1 nranks=3 thread-mpi openmp gpucomm
  
  # Test newest gcc supported by newest CUDA at time of release
+# Test multiple PP ranks without separate PME rank
  # Test thread-MPI with CUDA
  # Test GPU comm features in the above combination
-gcc-8 gpuhw=nvidia nranks=1 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpucomm
+gcc-8 gpuhw=nvidia nranks=2 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpucomm
  
  # Test non-default use of mdrun -gpu_id
  # Test GPU-sharing among 4 PP ranks
@@ -36,3 +38,6 @@ gcc-7 gpuhw=none cuda-10.0 openmp no-tng release-with-assert gpucomm
  # Test OpenCL build with gpudev features
  # Test GPU comm on the OpenCL path where it is unsupported
  clang-8 openmp gpuhw=amd opencl-1.2 clFFT-2.14 simd=None gpucomm
+
+# Test a non-GPU build with the gpucomm optiom set
+gcc-8 double x11 no-tng fftpack simd=sse4.1 gpucomm
diff --git a/src/gromacs/domdec/CMakeLists.txt b/src/gromacs/domdec/CMakeLists.txt

index 1a17c41300fb5f0cfb1eaf3f5cf08da873a0a568..c4fca1e308a2c540ee356a939067a407ad91d719 100644 (file)
--- a/src/gromacs/domdec/CMakeLists.txt
+++ b/src/gromacs/domdec/CMakeLists.txt
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2014,2018, by the GROMACS development team, led by
+# Copyright (c) 2014,2018,2019, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -33,7 +33,12 @@
  # the research papers on the package. Check out http://www.gromacs.org.
  
  file(GLOB DOMDEC_SOURCES *.cpp)
-set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${DOMDEC_SOURCES} PARENT_SCOPE)
+
+if(GMX_USE_CUDA)
+  file(GLOB DOMDEC_CUDA_SOURCES gpuhaloexchange_impl.cu)
+endif()
+
+set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${DOMDEC_SOURCES} ${DOMDEC_CUDA_SOURCES} PARENT_SCOPE)
  
  if (BUILD_TESTING)
      add_subdirectory(tests)
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp

index 5c76700e8a6dc7c358174b48bd706758ad1d97b6..f02b37a39c652fcfee12a09df8d8493f3105bf9b 100644 (file)
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -55,6 +55,7 @@
  #include "gromacs/domdec/dlbtiming.h"
  #include "gromacs/domdec/domdec_network.h"
  #include "gromacs/domdec/ga2la.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
  #include "gromacs/domdec/options.h"
  #include "gromacs/domdec/partition.h"
  #include "gromacs/gmxlib/network.h"
@@ -3016,7 +3017,13 @@ getDDSettings(const gmx::MDLogger     &mdlog,
  
      ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
      ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
-    ddSettings.request1DAnd1Pulse  = bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0));
+    // TODO GPU halo exchange requires a 1D single-pulse DD, and when
+    // it is properly integrated the hack with GMX_GPU_DD_COMMS should
+    // be removed.
+    ddSettings.request1DAnd1Pulse  = (bool(dd_getenv(mdlog, "GMX_DD_1D_1PULSE", 0)) ||
+                                      (bool(getenv("GMX_GPU_DD_COMMS") != nullptr &&
+                                            GMX_THREAD_MPI &&
+                                            (GMX_GPU == GMX_GPU_CUDA))));
      ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
      ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
      ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
diff --git a/src/gromacs/domdec/domdec_struct.h b/src/gromacs/domdec/domdec_struct.h

index 88bc93be50d35f026301a1b0c507c73dff1987c4..60bd04913d39482cbe185ec374db263f475d45f6 100644 (file)
--- a/src/gromacs/domdec/domdec_struct.h
+++ b/src/gromacs/domdec/domdec_struct.h
@@ -74,6 +74,7 @@ namespace gmx
  {
  template <typename T> class HashedMap;
  class LocalAtomSetManager;
+class GpuHaloExchange;
  }
  
  typedef struct {
@@ -226,6 +227,9 @@ struct gmx_domdec_t { //NOLINT(clang-analyzer-optin.performance.Padding)
  
      /* gmx_pme_recv_f buffer */
      std::vector<gmx::RVec> pmeForceReceiveBuffer;
+
+    /* GPU halo exchange object */
+    std::unique_ptr<gmx::GpuHaloExchange> gpuHaloExchange;
  };
  
  //! Are we the master node for domain decomposition
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h

new file mode 100644 (file)

index 0000000..58f908b
--- /dev/null
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -0,0 +1,108 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Declaration of GPU halo exchange.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ * \inlibraryapi
+ * \ingroup module_domdec
+ */
+#ifndef GMX_DOMDEC_GPUHALOEXCHANGE_H
+#define GMX_DOMDEC_GPUHALOEXCHANGE_H
+
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/classhelpers.h"
+#include "gromacs/utility/gmxmpi.h"
+
+struct gmx_domdec_t;
+
+namespace gmx
+{
+
+/*! \libinternal
+ * \brief Manages GPU Halo Exchange object */
+class GpuHaloExchange
+{
+
+    public:
+        /*! \brief Creates GPU Halo Exchange object.
+         *
+         * Halo exchange will be performed in \c streamNonLocal, and
+         * the main communicateHaloCoordinates method must be called
+         * before any subsequent operations that access non-local
+         * parts of the coordinate buffer (such as the non-local
+         * non-bonded kernels). It also must be called after the local
+         * coordinates buffer operations (where the coordinates are
+         * copied to the device and hence the \c
+         * coordinatesOnDeviceEvent is recorded).
+         *
+         * \param [inout] dd                       domdec structure
+         * \param [in]    mpi_comm_mysim           communicator used for simulation
+         * \param [in]    streamNonLocal           non-local NB CUDA stream.
+         * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
+         */
+        GpuHaloExchange(gmx_domdec_t *dd,
+                        MPI_Comm      mpi_comm_mysim,
+                        void         *streamNonLocal,
+                        void         *coordinatesOnDeviceEvent);
+        ~GpuHaloExchange();
+
+        /*! \brief
+         *
+         * Initialization for GPU halo exchange of coordinates buffer
+         * \param [in] d_coordinateBuffer   pointer to coordinates buffer in GPU memory
+         */
+        void reinitHalo(rvec *d_coordinateBuffer);
+
+
+        /*! \brief GPU halo exchange of coordinates buffer.
+         *
+         * Must be called after local setCoordinates (which records an
+         * event when the coordinate data has been copied to the
+         * device).
+         * \param [in] box  Coordinate box (from which shifts will be constructed)
+         */
+        void communicateHaloCoordinates(const matrix box);
+
+    private:
+        class Impl;
+        gmx::PrivateImplPointer<Impl> impl_;
+
+};
+
+} //namespace gmx
+
+#endif
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp

new file mode 100755 (executable)

index 0000000..4b491ea
--- /dev/null
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -0,0 +1,89 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief May be used to implement Domdec CUDA interfaces for non-GPU builds.
+ *
+ * Currently, reports and exits if any of the interfaces are called.
+ * Needed to satisfy compiler on systems, where CUDA is not available.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_domdec
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gromacs/domdec/gpuhaloexchange.h"
+
+#if GMX_GPU != GMX_GPU_CUDA
+
+namespace gmx
+{
+
+/*!\brief Impl class stub. */
+class GpuHaloExchange::Impl
+{
+};
+
+/*!\brief Constructor stub. */
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
+                                 MPI_Comm       /* mpi_comm_mysim */,
+                                 void         * /*streamNonLocal */,
+                                 void         * /*coordinatesOnDeviceEvent*/)
+    : impl_(nullptr)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+}
+
+GpuHaloExchange::~GpuHaloExchange() = default;
+
+/*!\brief init halo exhange stub. */
+void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+}
+
+/*!\brief apply X halo exchange stub. */
+void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange exchange was called insted of the correct implementation.");
+}
+
+
+}      // namespace gmx
+
+#endif /* GMX_GPU != GMX_GPU_CUDA */
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu

new file mode 100644 (file)

index 0000000..4e8ba35
--- /dev/null
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -0,0 +1,344 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements GPU halo exchange using CUDA.
+ *
+ *
+ * \author Alan Gray <alang@nvidia.com.com>
+ *
+ * \ingroup module_domdec
+ */
+#include "gmxpre.h"
+
+#include "gpuhaloexchange_impl.cuh"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
+#include "gromacs/pbcutil/ishift.h"
+
+#include "domdec_internal.h"
+
+namespace gmx
+{
+
+//! Number of CUDA threads in a block
+//TODO Optimize this through experimentation
+constexpr static int c_threadsPerBlock = 256;
+
+template <bool usePBC>
+__global__ void packSendBufKernel(float3 * __restrict__       dataPacked,
+                                  const float3 * __restrict__ data,
+                                  const int * __restrict__    map,
+                                  const int                   mapSize,
+                                  const float3                coordinateShift)
+{
+    int           threadIndex          = blockIdx.x*blockDim.x+threadIdx.x;
+    float3       *gm_dataDest          = &dataPacked[threadIndex];
+    const float3 *gm_dataSrc           = &data[map[threadIndex]];
+
+    if (threadIndex < mapSize)
+    {
+        if (usePBC)
+        {
+            *gm_dataDest = *gm_dataSrc + coordinateShift;
+        }
+        else
+        {
+            *gm_dataDest = *gm_dataSrc;
+        }
+
+    }
+
+    return;
+}
+
+void GpuHaloExchange::Impl::reinitHalo(float3      *d_coordinatesBuffer)
+{
+
+    d_x_ = d_coordinatesBuffer;
+
+    cudaStream_t                  stream            = nonLocalStream_;
+    int                           nzone             = 1;
+    const gmx_domdec_comm_t      &comm              = *dd_->comm;
+    const gmx_domdec_comm_dim_t  &cd                = comm.cd[0];
+    const gmx_domdec_ind_t       &ind               = cd.ind[0];
+    int                           newSize           = ind.nsend[nzone+1];
+
+    GMX_RELEASE_ASSERT(cd.numPulses() == 1, "Multiple pulses are not yet supported in GPU halo exchange");
+    GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange");
+
+    // reallocates only if needed
+    h_indexMap_.resize(newSize);
+    // reallocate on device only if needed
+    if (newSize > maxPackedBufferSize_)
+    {
+        reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, nullptr);
+        reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, nullptr);
+        reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, nullptr);
+        maxPackedBufferSize_ = newSize;
+    }
+
+    xSendSize_ = newSize;
+    MPI_Sendrecv(&xSendSize_, sizeof(int), MPI_BYTE, sendRankX_, 0,
+                 &xRecvSize_, sizeof(int), MPI_BYTE, recvRankX_, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+    fSendSize_ = xRecvSize_;
+    fRecvSize_ = xSendSize_;
+
+    localOffset_ = comm.atomRanges.numHomeAtoms();  //offset for data recieved by this rank
+
+    GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch");
+    std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
+
+    copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, stream, GpuApiCallBehavior::Async, nullptr);
+
+    // This rank will push data to its neighbor, so needs to know
+    // the remote receive address and similarly send its receive
+    // address to other neighbour. We can do this here in reinit fn
+    // since the pointers will not change until the next NS step.
+
+    //Coordinates buffer:
+    void* recvPtr  = static_cast<void*> (&d_coordinatesBuffer[localOffset_]);
+    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0,
+                 &remoteXPtr_, sizeof(void*), MPI_BYTE, sendRankX_, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+    //Force buffer:
+    recvPtr  = static_cast<void*> (d_recvBuf_);
+    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankF_, 0,
+                 &remoteFPtr_, sizeof(void*), MPI_BYTE, sendRankF_, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+
+    return;
+}
+
+// The following method be called after local setCoordinates (which records the coordinatesOnDeviceEvent_
+// event when the coordinate data has been copied to the device).
+void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box)
+{
+
+    //ensure stream waits until coordinate data is available on device
+    coordinatesOnDeviceEvent_->enqueueWaitEvent(nonLocalStream_);
+
+    // launch kernel to pack send buffer
+    KernelLaunchConfig config;
+    config.blockSize[0]     = c_threadsPerBlock;
+    config.blockSize[1]     = 1;
+    config.blockSize[2]     = 1;
+    config.gridSize[0]      = (xSendSize_+c_threadsPerBlock-1)/c_threadsPerBlock;
+    config.gridSize[1]      = 1;
+    config.gridSize[2]      = 1;
+    config.sharedMemorySize = 0;
+    config.stream           = nonLocalStream_;
+
+    const float3     *sendBuf  = d_sendBuf_;
+    const float3     *d_x      = d_x_;
+    const int        *indexMap = d_indexMap_;
+    const int         size     = xSendSize_;
+    // The coordinateShift changes between steps when we have
+    // performed a DD partition, or have updated the box e.g. when
+    // performing pressure coupling. So, for simplicity, the the box
+    // is used every step to pass the shift vector as an argument of
+    // the packing kernel.
+    //
+    // Because only one-dimensional DD is supported, the coordinate
+    // shift only needs to handle that dimension.
+    const int         dimensionIndex = dd_->dim[0];
+    const float3      coordinateShift {
+        box[dimensionIndex][XX], box[dimensionIndex][YY], box[dimensionIndex][ZZ]
+    };
+
+    // Avoid launching kernel when there is no work to do
+    if (size > 0)
+    {
+        auto              kernelFn = usePBC_ ? packSendBufKernel<true> : packSendBufKernel<false>;
+
+        const auto        kernelArgs   = prepareGpuKernelArguments(kernelFn, config, &sendBuf, &d_x, &indexMap,
+                                                                   &size, &coordinateShift);
+
+        launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
+    }
+
+    communicateHaloData(d_x_, HaloQuantity::HaloCoordinates);
+
+    return;
+}
+
+
+void GpuHaloExchange::Impl::communicateHaloData(float3     * d_ptr,
+                                                HaloQuantity haloQuantity)
+{
+
+    void * sendPtr;
+    int    sendSize;
+    void * remotePtr;
+    int    sendRank;
+    int    recvRank;
+    if (haloQuantity == HaloQuantity::HaloCoordinates)
+    {
+        sendPtr   = static_cast<void*> (d_sendBuf_);
+        sendSize  = xSendSize_;
+        remotePtr = remoteXPtr_;
+        sendRank  = sendRankX_;
+        recvRank  = recvRankX_;
+    }
+    else
+    {
+        sendPtr   = static_cast<void*> (&(d_ptr[localOffset_]));
+        sendSize  = fSendSize_;
+        remotePtr = remoteFPtr_;
+        sendRank  = sendRankF_;
+        recvRank  = recvRankF_;
+    }
+
+    communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank);
+}
+
+
+void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
+                                                              int   sendSize,
+                                                              int   sendRank,
+                                                              void *remotePtr,
+                                                              int   recvRank)
+{
+
+    cudaError_t  stat;
+
+    // We asynchronously push data to remote rank. The remote
+    // destination pointer has already been set in the init fn.  We
+    // don't need to worry about overwriting data the remote ranks
+    // still needs since the halo exchange is just done once per
+    // timestep, for each of X and F.
+
+    // send data to neighbor, if any data exists to send
+    if (sendSize > 0)
+    {
+        stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, nonLocalStream_);
+        CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
+    }
+
+    //ensure pushed data has arrived before remote rank progresses
+    // This rank records an event and sends it to the remote rank which has just been pushed data.
+    // This rank recieves event from remote rank which has pushed data here, and enqueues that event to
+    // its stream.
+    GpuEventSynchronizer *haloDataTransferRemote;
+
+    haloDataTransferLaunched_->markEvent(nonLocalStream_);
+
+    MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0,
+                 &haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0,
+                 mpi_comm_mysim_, MPI_STATUS_IGNORE);
+
+    haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
+
+}
+
+/*! \brief Create Domdec GPU object */
+GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
+                            MPI_Comm      mpi_comm_mysim,
+                            void        * nonLocalStream,
+                            void        * coordinatesOnDeviceEvent)
+    : dd_(dd),
+      sendRankX_(dd->neighbor[0][1]),
+      recvRankX_(dd->neighbor[0][0]),
+      sendRankF_(dd->neighbor[0][0]),
+      recvRankF_(dd->neighbor[0][1]),
+      usePBC_(dd->ci[dd->dim[0]] == 0),
+      haloDataTransferLaunched_(new GpuEventSynchronizer()),
+      mpi_comm_mysim_(mpi_comm_mysim),
+      nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream)),
+      coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
+{
+
+    GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "GPU Halo exchange is currently only supported with thread-MPI enabled");
+
+    if (dd->ndim > 1)
+    {
+        gmx_fatal(FARGS, "Error: dd->ndim > 1 is not yet supported in GPU halo exchange");
+    }
+
+    if (usePBC_ && dd->unitCellInfo.haveScrewPBC)
+    {
+        gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n");
+    }
+
+    changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported);
+
+    allocateDeviceBuffer(&d_fShift_, 1, nullptr);
+
+}
+
+GpuHaloExchange::Impl::~Impl()
+{
+    freeDeviceBuffer(&d_indexMap_);
+    freeDeviceBuffer(&d_sendBuf_);
+    freeDeviceBuffer(&d_recvBuf_);
+    freeDeviceBuffer(&d_fShift_);
+    delete haloDataTransferLaunched_;
+}
+
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
+                                 MPI_Comm      mpi_comm_mysim,
+                                 void         *nonLocalStream,
+                                 void         *coordinatesOnDeviceEvent)
+    : impl_(new Impl(dd, mpi_comm_mysim, nonLocalStream, coordinatesOnDeviceEvent))
+{
+}
+
+GpuHaloExchange::~GpuHaloExchange() = default;
+
+void GpuHaloExchange::reinitHalo(rvec        *d_coordinatesBuffer)
+{
+    impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer));
+}
+
+void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
+{
+    impl_->communicateHaloCoordinates(box);
+}
+
+} //namespace gmx
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh

new file mode 100644 (file)

index 0000000..bd12565
--- /dev/null
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -0,0 +1,180 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declares CUDA implementation of GPU Halo Exchange.
+ *
+ * This header file is needed to include from both the device-side
+ * kernels file, and the host-side management code.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_domdec
+ */
+#ifndef GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H
+#define GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H
+
+#include "gromacs/domdec/gpuhaloexchange.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/hostallocator.h"
+
+namespace gmx
+{
+
+/*! \brief switch for whether coordinates or force halo is being applied */
+enum class HaloQuantity
+{
+    HaloCoordinates, HaloForces
+};
+
+/*! \internal \brief Class with interfaces and data for GPU Halo Exchange */
+class GpuHaloExchange::Impl
+{
+
+    public:
+        /*! \brief Creates GPU Halo Exchange object.
+         *
+         * \param [inout] dd                       domdec structure
+         * \param [in]    mpi_comm_mysim           communicator used for simulation
+         * \param [in]    nonLocalStream           non-local NB CUDA stream
+         * \param [in]    coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
+         */
+        Impl(gmx_domdec_t *dd,
+             MPI_Comm mpi_comm_mysim,
+             void *nonLocalStream,
+             void *coordinatesOnDeviceEvent);
+        ~Impl();
+
+        /*! \brief
+         * (Re-) Initialization for GPU halo exchange
+         * \param [in] d_coordinatesBuffer  pointer to coordinates buffer in GPU memory
+         */
+        void reinitHalo(float3 *d_coordinatesBuffer);
+
+
+        /*! \brief
+         * GPU halo exchange of coordinates buffer
+         * \param [in] box  Coordinate box (from which shifts will be constructed)
+         */
+        void communicateHaloCoordinates(const matrix box);
+
+    private:
+
+        /*! \brief Data transfer wrapper for GPU halo exchange
+         * \param [inout] d_ptr      pointer to coordinates or force buffer in GPU memory
+         * \param [in] haloQuantity  switch on whether X or F halo exchange is being performed
+         */
+        void communicateHaloData(float3      *d_ptr,
+                                 HaloQuantity haloQuantity);
+
+        /*! \brief Data transfer for GPU halo exchange using CUDA memcopies
+         * \param [inout] sendPtr    address to send data from
+         * \param [in] sendSize      number of atoms to be sent
+         * \param [in] sendRank      rank to send data to
+         * \param [inout] remotePtr  remote address to recv data
+         * \param [in] recvRank      rank to recv data from
+         */
+        void communicateHaloDataWithCudaDirect(void *sendPtr,
+                                               int   sendSize,
+                                               int   sendRank,
+                                               void* remotePtr,
+                                               int   recvRank);
+
+        //! Domain decomposition object
+        gmx_domdec_t               *dd_                       = nullptr;
+        //! map of indices to be sent from this rank
+        gmx::HostVector<int>        h_indexMap_;
+        //! device copy of index map
+        int                        *d_indexMap_               = nullptr;
+        //! number of elements in index map array
+        int                         indexMapSize_             = -1;
+        //! number of elements allocated in index map array
+        int                         indexMapSizeAlloc_        = -1;
+        //! device buffer for sending packed data
+        float3                     *d_sendBuf_ = nullptr;
+        //! number of atoms in sendbuf array
+        int                         sendBufSize_              = -1;
+        //! number of atoms allocated in sendbuf array
+        int                         sendBufSizeAlloc_         = -1;
+        //! device buffer for receiving packed data
+        float3                     *d_recvBuf_                = nullptr;
+        //! maximum size of packed buffer
+        int                         maxPackedBufferSize_      = 0;
+        //! number of atoms in recvbuf array
+        int                         recvBufSize_              = -1;
+        //! number of atoms allocated in recvbuf array
+        int                         recvBufSizeAlloc_         = -1;
+        //! rank to send data to for X
+        int                         sendRankX_                = 0;
+        //! rank to recv data from for X
+        int                         recvRankX_                = 0;
+        //! rank to send data to for F
+        int                         sendRankF_                = 0;
+        //! rank to recv data from for F
+        int                         recvRankF_                = 0;
+        //! send copy size from this rank for X
+        int                         xSendSize_                = 0;
+        //! recv copy size to this rank for X
+        int                         xRecvSize_                = 0;
+        //! send copy size from this rank for F
+        int                         fSendSize_                = 0;
+        //! recv copy size to this rank for F
+        int                         fRecvSize_                = 0;
+        //! offset of local halo region
+        int                         localOffset_              = 0;
+        //! remote GPU coordinates buffer pointer for pushing data
+        void                       *remoteXPtr_               = 0;
+        //! remote GPU force buffer pointer for pushing data
+        void                       *remoteFPtr_               = 0;
+        //! Periodic Boundary Conditions for this rank
+        bool                        usePBC_                   = false;
+        //! force shift buffer on device
+        float3 *                    d_fShift_                 = nullptr;
+        //! Event triggered when halo transfer has been launched with direct CUD memory copy
+        GpuEventSynchronizer       *haloDataTransferLaunched_ = nullptr;
+        //! MPI communicator used for simulation
+        MPI_Comm                    mpi_comm_mysim_;
+        //! CUDA stream for non-local non-bonded calculations
+        cudaStream_t                nonLocalStream_           = nullptr;
+        //! Event triggered when coordinates have been copied to device
+        GpuEventSynchronizer       *coordinatesOnDeviceEvent_ = nullptr;
+        //! full coordinates buffer in GPU memory
+        float3                     *d_x_                      = nullptr;
+
+};
+
+} // namespace gmx
+
+#endif
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index c540fa86f6d2d390ceec96aa0d212649b46daf5b..e2fffacd83c6c0366c9d9e6752417305940f9203 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -49,6 +49,7 @@
  #include "gromacs/domdec/dlbtiming.h"
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
  #include "gromacs/domdec/partition.h"
  #include "gromacs/essentialdynamics/edsam.h"
  #include "gromacs/ewald/pme.h"
@@ -125,6 +126,10 @@ static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GP
  // TODO eventially tie this in with other existing GPU flags.
  static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
  
+/*! \brief environment variable to enable GPU P2P communication */
+static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
+    && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
+
  static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
  {
      const int      end = forceToAdd.size();
@@ -1180,6 +1185,12 @@ void do_force(FILE                                     *fplog,
          launchPmeGpuFftAndGather(fr->pmedata, wcycle);
      }
  
+    const bool            ddUsesGpuDirectCommunication
+        = c_enableGpuHaloExchange && c_enableGpuBufOps && bUseGPU && havePPDomainDecomposition(cr);
+    gmx::GpuHaloExchange *gpuHaloExchange = ddUsesGpuDirectCommunication ? cr->dd->gpuHaloExchange.get() : nullptr;
+    GMX_ASSERT(!ddUsesGpuDirectCommunication || gpuHaloExchange != nullptr,
+               "Must have valid gpuHaloExchange when doing halo exchange on the GPU");
+
      /* Communicate coordinates and sum dipole if necessary +
         do non-local pair search */
      if (havePPDomainDecomposition(cr))
@@ -1196,15 +1207,36 @@ void do_force(FILE                                     *fplog,
              nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::NonLocal);
              wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
              wallcycle_stop(wcycle, ewcNS);
+            if (ddUsesGpuDirectCommunication)
+            {
+                rvec* d_x    = static_cast<rvec *> (nbv->get_gpu_xrvec());
+                gpuHaloExchange->reinitHalo(d_x);
+            }
          }
          else
          {
-            dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
+            if (ddUsesGpuDirectCommunication)
+            {
+                // The following must be called after local setCoordinates (which records an event
+                // when the coordinate data has been copied to the device).
+                gpuHaloExchange->communicateHaloCoordinates(box);
+
+                // TODO Force flags should include haveFreeEnergyWork for this domain
+                if (forceWork.haveCpuBondedWork || (fr->efep != efepNO))
+                {
+                    //non-local part of coordinate buffer must be copied back to host for CPU work
+                    nbv->launch_copy_x_from_gpu(as_rvec_array(x.unpaddedArrayRef().data()), Nbnxm::AtomLocality::NonLocal);
+                }
+            }
+            else
+            {
+                dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
+            }
  
              if (useGpuXBufOps == BufferOpsUseGpu::True)
              {
                  // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
-                if (!useGpuPme)
+                if (!useGpuPme && !ddUsesGpuDirectCommunication)
                  {
                      nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::NonLocal, false,
                                                x.unpaddedArrayRef());
@@ -1403,6 +1435,13 @@ void do_force(FILE                                     *fplog,
          update_QMMMrec(cr, fr, as_rvec_array(x.unpaddedArrayRef().data()), mdatoms, box);
      }
  
+    // TODO Force flags should include haveFreeEnergyWork for this domain
+    if (ddUsesGpuDirectCommunication &&
+        (forceWork.haveCpuBondedWork || (fr->efep != efepNO)))
+    {
+        /* Wait for non-local coordinate data to be copied from device */
+        nbv->wait_nonlocal_x_copy_D2H_done();
+    }
      /* Compute the bonded and non-bonded energies and optionally forces */
      do_force_lowlevel(fr, inputrec, &(top->idef),
                        cr, ms, nrnb, wcycle, mdatoms,
@@ -1419,8 +1458,10 @@ void do_force(FILE                                     *fplog,
                           forceFlags, &forceOut.forceWithVirial(), enerd,
                           ed, forceFlags.doNeighborSearch);
  
-    bool                   useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
-    bool                   haveCpuForces       = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuFPmeReduction);
+    const bool useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+    // TODO Force flags should include haveFreeEnergyWork for this domain
+    const bool haveCpuForces       = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork ||
+                                      useCpuFPmeReduction || (fr->efep != efepNO));
  
      // Will store the amount of cycles spent waiting for the GPU that
      // will be later used in the DLB accounting.
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp

index 5caa3caee57552048aceb65916164c6917b013f2..68348803b2af02f99f7ab944f2b0c197b23363c1 100644 (file)
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -59,6 +59,7 @@
  #include "gromacs/commandline/filenm.h"
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
  #include "gromacs/domdec/localatomsetmanager.h"
  #include "gromacs/domdec/partition.h"
  #include "gromacs/ewald/ewald_utils.h"
@@ -161,18 +162,25 @@
  namespace gmx
  {
  
-/*! \brief Log if development feature flags are encountered
+/*! \brief environment variable to enable GPU P2P communication */
+static const bool c_enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr)
+    && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
+
+/*! \brief Manage any development feature flag variables encountered
   *
- * The use of dev features indicated by environment variables is logged
- * in order to ensure that runs with such featrues enabled can be identified
- * from their log and standard output.
+ * The use of dev features indicated by environment variables is
+ * logged in order to ensure that runs with such featrues enabled can
+ * be identified from their log and standard output. Any cross
+ * dependencies are also checked, and if unsatisified, a fatal error
+ * issued.
   *
   * \param[in]  mdlog        Logger object.
   */
-static void reportDevelopmentFeatures(const gmx::MDLogger &mdlog)
+static void manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
  {
      const bool enableGpuBufOps       = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
      const bool useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
+    const bool enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
  
      if (enableGpuBufOps)
      {
@@ -180,11 +188,22 @@ static void reportDevelopmentFeatures(const gmx::MDLogger &mdlog)
                  "NOTE: This run uses the 'GPU buffer ops' feature, enabled by the GMX_USE_GPU_BUFFER_OPS environment variable.");
      }
  
+    if (enableGpuHaloExchange)
+    {
+        if (!enableGpuBufOps)
+        {
+            gmx_fatal(FARGS, "Cannot enable GPU halo exchange without GPU buffer operations, set GMX_USE_GPU_BUFFER_OPS=1\n");
+        }
+        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+                "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
+    }
+
      if (useGpuUpdateConstrain)
      {
          GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
                  "NOTE: This run uses the 'GPU update/constraints' feature, enabled by the GMX_UPDATE_CONSTRAIN_GPU environment variable.");
      }
+
  }
  
  /*! \brief Barrier for safe simultaneous thread access to mdrunner data
@@ -654,7 +673,7 @@ int Mdrunner::mdrunner()
      gmx::MDLogger    mdlog(logOwner.logger());
  
      // report any development features that may be enabled by environment variables
-    reportDevelopmentFeatures(mdlog);
+    manageDevelopmentFeatures(mdlog);
  
      // With thread-MPI, the communicator changes after threads are
      // launched, so this is rebuilt for the master rank at that
@@ -1316,6 +1335,15 @@ int Mdrunner::mdrunner()
                        pforce,
                        wcycle);
  
+        // TODO Move this to happen during domain decomposition setup,
+        // once stream and event handling works well with that.
+        if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded)
+        {
+            void *stream                   = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+            void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
+            cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, stream, coordinatesOnDeviceEvent);
+        }
+
          /* Initialize the mdAtoms structure.
           * mdAtoms is not filled with atom data,
           * as this can not be done now with domain decomposition.
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index 5c72c7faadf0bf24a875996bd35fc75b69fcb27e..9715d2985f1055b8cec5ce60f632652552681ce3 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -111,7 +111,6 @@
  #include "gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu"
  #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
  
-
  namespace Nbnxm
  {
  
@@ -777,6 +776,11 @@ void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid               &grid,
      copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, numCopyAtoms,
                         stream, GpuApiCallBehavior::Async, nullptr);
  
+    if (interactionLoc == Nbnxm::InteractionLocality::Local)
+    {
+        nb->xAvailableOnDevice->markEvent(stream);
+    }
+
      if (bDoTime)
      {
          nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
@@ -1011,6 +1015,47 @@ void nbnxn_launch_copy_f_from_gpu(const AtomLocality               atomLocality,
      return;
  }
  
+void nbnxn_launch_copy_x_from_gpu(const AtomLocality               atomLocality,
+                                  const Nbnxm::GridSet            &gridSet,
+                                  gmx_nbnxn_gpu_t                 *nb,
+                                  rvec                            *x)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+    GMX_ASSERT(x,  "Need a valid x pointer");
+
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+    cudaStream_t              stream    = nb->stream[iLocality];
+
+    bool                      bDoTime = nb->bDoTime;
+    cu_timers_t              *t       = nb->timers;
+    int                       atomStart, nAtoms;
+
+    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
+    }
+
+    GMX_ASSERT(nb->xrvec,  "Need a valid nb->xrvec pointer");
+    rvec       *ptrDest = reinterpret_cast<rvec *> (x[atomStart]);
+    rvec       *ptrSrc  = reinterpret_cast<rvec *> (nb->xrvec[atomStart]);
+    copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
+                         stream, GpuApiCallBehavior::Async, stream);
+
+    if (atomLocality == AtomLocality::NonLocal)
+    {
+        nb->xNonLocalCopyD2HDone->markEvent(stream);
+    }
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
+    }
+
+    return;
+}
+
  void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomLocality,
                                          gmx_nbnxn_gpu_t                   *nb)
  {
@@ -1024,4 +1069,19 @@ void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomL
  
  }
  
+void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t *gpu_nbv)
+{
+    return static_cast<void *> (gpu_nbv->xrvec);
+}
+
+void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t   *nb)
+{
+    return static_cast<void*> (nb->xAvailableOnDevice);
+}
+
+void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_cuda_t   *nb)
+{
+    nb->xNonLocalCopyD2HDone->waitForEvent();
+}
+
  } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu

index 551f5c22263f1cc16c5e3c8df349bc010d6504b7..bdeefcbb660965d271ad34ed41f5c3e477e041dd 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -51,6 +51,7 @@
  // TODO Remove this comment when the above order issue is resolved
  #include "gromacs/gpu_utils/cudautils.cuh"
  #include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
  #include "gromacs/gpu_utils/pmalloc_cuda.h"
  #include "gromacs/hardware/gpu_hw_info.h"
  #include "gromacs/math/vectypes.h"
@@ -481,6 +482,9 @@ gpu_init(const gmx_device_info_t   *deviceInfo,
      stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
      CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
  
+    nb->xAvailableOnDevice   = new GpuEventSynchronizer();
+    nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer();
+
      /* WARNING: CUDA timings are incorrect with multiple streams.
       *          This is the main reason why they are disabled by default.
       */
@@ -899,6 +903,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
          const int           atomIndicesSize   = gridSet.atomIndices().size();
          const int          *cxy_na            = grid.cxy_na().data();
          const int          *cxy_ind           = grid.cxy_ind().data();
+        // TODO Should be done once per gridset
          const int           numRealAtomsTotal = gridSet.numRealAtomsTotal();
  
          reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h

index c18cd3f9a24bee527b21d3ce5530a8abf9a30b4e..ec4b2b8d339592d85b5bd7e802dca928f001d1bb 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -206,6 +206,8 @@ using cu_plist_t = Nbnxm::gpu_plist;
   */
  typedef struct Nbnxm::gpu_timers_t cu_timers_t;
  
+class GpuEventSynchronizer;
+
  /** \internal
   * \brief Main data structure for CUDA nonbonded force calculations.
   */
@@ -278,6 +280,16 @@ struct gmx_nbnxn_cuda_t
      gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
  
  
+    GpuEventSynchronizer *xAvailableOnDevice;   /**< event triggered when
+                                                   coordinate buffer has been
+                                                   copied to device by PP task and
+                                                   any dependent task (e.g. transfer of coordinates
+                                                   to the PME rank's GPU) can proceed. */
+
+    GpuEventSynchronizer *xNonLocalCopyD2HDone; /**< event triggered when
+                                                   non-local coordinate buffer has been
+                                                   copied from device to host*/
+
      /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
       * concurrent streams, so we won't time if both l/nl work is done on GPUs.
       * Timer init/uninit is still done even with timing off so only the condition
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp

index 68eac205a431f5def1534b8db975dbeb39e98ebb..031033dea93e8e29df1595229e3612b87ae582b0 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -302,9 +302,32 @@ void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocali
                                   f);
  }
  
+void nonbonded_verlet_t::launch_copy_x_from_gpu(rvec *x, const Nbnxm::AtomLocality locality)
+{
+    nbnxn_launch_copy_x_from_gpu(locality,
+                                 pairSearch_->gridSet(),
+                                 gpu_nbv,
+                                 x);
+}
+
  void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality)
  {
      nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv);
  }
  
+void* nonbonded_verlet_t::get_gpu_xrvec()
+{
+    return Nbnxm::nbnxn_get_gpu_xrvec(gpu_nbv);
+}
+
+void* nonbonded_verlet_t::get_x_on_device_event()
+{
+    return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv);
+}
+
+void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done()
+{
+    Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
+}
+
  /*! \endcond */
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h

index db550859dfffa7522a18482a09d1c28e0f1b8b8a..7a27a5cb4e10af0696e90c2d5dc87b30577c89c1 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -388,12 +388,24 @@ struct nonbonded_verlet_t
          /*! \brief D2H transfer of force buffer*/
          void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
  
+        /*! \brief D2H transfer of coordinate buffer*/
+        void launch_copy_x_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
+
          /*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete
           *
           * FIXME: need more details: when should be called / after which operation, etc.
           */
          void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality);
  
+        /*! \brief return GPU pointer to x in rvec format */
+        void* get_gpu_xrvec();
+
+        /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
+        void* get_x_on_device_event();
+
+        /*! \brief Wait for non-local copy of coordinate buffer from device to host */
+        void wait_nonlocal_x_copy_D2H_done();
+
          //! Return the kernel setup
          const Nbnxm::KernelSetup &kernelSetup() const
          {
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h

index 3794c98320b0abb8d1053168b5e39d3167609722..743bcffa2034cd69d35eb2ecf52d6d7c57488f4c 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -370,12 +370,46 @@ void nbnxn_launch_copy_f_from_gpu(AtomLocality            gmx_unused  atomLocali
                                    gmx_nbnxn_gpu_t         gmx_unused *nb,
                                    rvec                    gmx_unused *f) CUDA_FUNC_TERM;
  
+/*! \brief Asynchronous launch of copying coordinate buffer from GPU to CPU
+ * \param[in]  atomLocality  Locality for data trasnfer
+ * \param[in]  gridSet       The Grid Set data object
+ * \param[in]  nb            The nonbonded data GPU structure
+ * \param[out] x             Coordinate buffer on CPU
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_launch_copy_x_from_gpu(AtomLocality            gmx_unused  atomLocality,
+                                  const Nbnxm::GridSet    gmx_unused &gridSet,
+                                  gmx_nbnxn_gpu_t         gmx_unused *nb,
+                                  rvec                    gmx_unused *x) CUDA_FUNC_TERM;
+
  /*! \brief Wait for GPU stream to complete */
  CUDA_FUNC_QUALIFIER
  void nbnxn_wait_for_gpu_force_reduction(AtomLocality            gmx_unused  atomLocality,
                                          gmx_nbnxn_gpu_t         gmx_unused *nb) CUDA_FUNC_TERM;
  
+/*! \brief sync CPU thread on coordinate copy to device
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
  
-}     // namespace Nbnxm
+/*! \brief return pointer to event recorded when coordinates have been copied to device
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_x_on_device_event(const gmx_nbnxn_gpu_t gmx_unused    *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief return GPU pointer to x in rvec format
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Wait for non-local copy of coordinate buffer from device to host
+ * \param[in] nb                   The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
  
+} // namespace Nbnxm
  #endif
author	Alan Gray <alang@nvidia.com>
	Tue, 26 Feb 2019 12:43:22 +0000 (04:43 -0800)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 16 Sep 2019 13:12:49 +0000 (15:12 +0200)
admin/builds/gpucomm-matrix.txt		patch \| blob \| history
src/gromacs/domdec/CMakeLists.txt		patch \| blob \| history
src/gromacs/domdec/domdec.cpp		patch \| blob \| history
src/gromacs/domdec/domdec_struct.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange.h	[new file with mode: 0644]	patch \| blob
src/gromacs/domdec/gpuhaloexchange_impl.cpp	[new file with mode: 0755]	patch \| blob
src/gromacs/domdec/gpuhaloexchange_impl.cu	[new file with mode: 0644]	patch \| blob
src/gromacs/domdec/gpuhaloexchange_impl.cuh	[new file with mode: 0644]	patch \| blob
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/mdrun/runner.cpp		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.cpp		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_gpu.h		patch \| blob \| history