Activate with GMX_GPU_DD_COMMS environment variable.
Extends GPU Halo exchange feature to provide GPU Force halo exchange
functionality. Does not yet support virial steps, which require an
extra shift force reduction - these are currently performed on the
non-buffer-ops / non-direct-comm path. It also has the same
limitations as the coordinate halo exchange.
Performs part of #2890. Future work to improve synchronization towards
a more one-sided scheme (#3092) and to make dependencies more
explicit (#3093).
Change-Id: Ifc23cc8db2655f7258e68b34e7cdc7b71994e1e8
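For context, a minimal sketch of how the activation constant used later in this change might be derived from the environment variable; the exact build-configuration conditions belong to the runner setup and are an assumption here, only the variable name and c_enableGpuHaloExchange come from this change.

// Hedged sketch (not part of this change): derive the gate for the GPU
// direct-communication halo exchange from the GMX_GPU_DD_COMMS variable.
#include <cstdlib>
static const bool c_enableGpuHaloExchange =
    (std::getenv("GMX_GPU_DD_COMMS") != nullptr); // real code also checks the CUDA/thread-MPI build configuration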
public:
/*! \brief Creates GPU Halo Exchange object.
*
- * Halo exchange will be performed in \c streamNonLocal, and
- * the main communicateHaloCoordinates method must be called
- * before any subsequent operations that access non-local
- * parts of the coordinate buffer (such as the non-local
- * non-bonded kernels). It also must be called after the local
- * coordinates buffer operations (where the coordinates are
- * copied to the device and hence the \c
- * coordinatesOnDeviceEvent is recorded).
+ * Coordinate halo exchange will be performed in \c
+ * streamNonLocal, and the \c communicateHaloCoordinates
+ * method must be called before any subsequent operations that
+ * access non-local parts of the coordinate buffer (such as
+ * the non-local non-bonded kernels). It also must be called
+ * after the local coordinates buffer operations (where the
+ * coordinates are copied to the device and hence the \c
+ * coordinatesOnDeviceEvent is recorded). Force Halo exchange
+ * will be performed in \c streamNonLocal (also potentially
+ * with buffer clearing in \c streamLocal) and the \c
+ * communicateHaloForces method must be called after the
+ * non-local buffer operations, after the local force buffer
+ * has been copied to the GPU (if CPU forces are present), and
+ * before the local buffer operations. The force halo exchange
+ * does not yet support virial steps.
*
* \param [inout] dd domdec structure
* \param [in] mpi_comm_mysim communicator used for simulation
+ * \param [in] streamLocal local NB CUDA stream.
* \param [in] streamNonLocal non-local NB CUDA stream.
* \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
*/
GpuHaloExchange(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
+ void *streamLocal,
void *streamNonLocal,
void *coordinatesOnDeviceEvent);
~GpuHaloExchange();
*
* Initialization for GPU halo exchange of coordinates and forces buffers
* \param [in] d_coordinateBuffer pointer to coordinates buffer in GPU memory
+ * \param [in] d_forcesBuffer  pointer to forces buffer in GPU memory
*/
- void reinitHalo(rvec *d_coordinateBuffer);
-
+ void reinitHalo(rvec *d_coordinateBuffer,
+ rvec *d_forcesBuffer);
/*! \brief GPU halo exchange of coordinates buffer.
*
*/
void communicateHaloCoordinates(const matrix box);
+ /*! \brief GPU halo exchange of force buffer.
+ * \param[in] accumulateForces True if forces should accumulate, otherwise they are set
+ */
+ void communicateHaloForces(bool accumulateForces);
+
+
private:
class Impl;
gmx::PrivateImplPointer<Impl> impl_;
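To make the call ordering documented above concrete, here is a hedged usage sketch; the caller context, the device buffer names and the haveCpuLocalForces flag are illustrative, not part of this change.

// Hedged usage sketch of the public API and its required ordering.
gmx::GpuHaloExchange haloExchange(dd, mpi_comm_mysim,
                                  streamLocal, streamNonLocal,
                                  coordinatesOnDeviceEvent);
// After each domain (re)partitioning, once the device buffers are valid:
haloExchange.reinitHalo(d_coordinates, d_forces);

// Each MD step:
// 1. local coordinate buffer ops copy x to the device and record
//    coordinatesOnDeviceEvent, then:
haloExchange.communicateHaloCoordinates(box);   // before non-local non-bonded kernels
// 2. ... non-local kernels and non-local force buffer ops run ...
// 3. if CPU force contributions exist, copy the local CPU forces to the
//    GPU first and accumulate onto them; otherwise pass false:
haloExchange.communicateHaloForces(haveCpuLocalForces); // before local force buffer ops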
/*!\brief Constructor stub. */
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
MPI_Comm /* mpi_comm_mysim */,
+ void * /*streamLocal */,
void * /*streamNonLocal */,
void * /*coordinatesOnDeviceEvent*/)
: impl_(nullptr)
GpuHaloExchange::~GpuHaloExchange() = default;
/*!\brief init halo exchange stub. */
-void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */)
+void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */,
+ rvec * /* d_forcesBuffer */)
{
GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
}
GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
}
+/*!\brief apply F halo exchange stub. */
+void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces)
+{
+    GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
+}
} // namespace gmx
* \brief Implements GPU halo exchange using CUDA.
*
*
- * \author Alan Gray <alang@nvidia.com.com>
+ * \author Alan Gray <alang@nvidia.com>
*
* \ingroup module_domdec
*/
return;
}
-void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer)
+/*! \brief unpack non-local force data buffer on the GPU using pre-populated "map" containing index information
+ * \param[out] data full array of force values
+ * \param[in] dataPacked packed array of force values to be transferred
+ * \param[in] map array of indices defining mapping from full to packed array
+ * \param[in] mapSize number of elements in map array
+ */
+template <bool accumulate>
+__global__ void unpackRecvBufKernel(float3 * __restrict__ data,
+ const float3 * __restrict__ dataPacked,
+ const int * __restrict__ map,
+ const int mapSize)
+{
+
+    int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
+
+    if (threadIndex < mapSize)
+    {
+        // only in-range threads read the map and unpack, avoiding an
+        // out-of-bounds read of the index map
+        const float3 *gm_dataSrc  = &dataPacked[threadIndex];
+        float3       *gm_dataDest = &data[map[threadIndex]];
+
+        if (accumulate)
+        {
+            *gm_dataDest += *gm_dataSrc;
+        }
+        else
+        {
+            *gm_dataDest = *gm_dataSrc;
+        }
+    }
+
+    return;
+}
+
+void GpuHaloExchange::Impl::reinitHalo(float3 *d_coordinatesBuffer,
+ float3 *d_forcesBuffer)
{
d_x_ = d_coordinatesBuffer;
+ d_f_ = d_forcesBuffer;
cudaStream_t stream = nonLocalStream_;
int nzone = 1;
fSendSize_ = xRecvSize_;
fRecvSize_ = xSendSize_;
- localOffset_ = comm.atomRanges.numHomeAtoms(); //offset for data recieved by this rank
+    numHomeAtoms_ = comm.atomRanges.numHomeAtoms(); //offset for data received by this rank
GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch");
std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
// since the pointers will not change until the next NS step.
//Coordinates buffer:
- void* recvPtr = static_cast<void*> (&d_coordinatesBuffer[localOffset_]);
+ void* recvPtr = static_cast<void*> (&d_coordinatesBuffer[numHomeAtoms_]);
MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0,
&remoteXPtr_, sizeof(void*), MPI_BYTE, sendRankX_, 0,
mpi_comm_mysim_, MPI_STATUS_IGNORE);
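    // Hedged sketch of the analogous exchange for the force path (elided
    // from this excerpt): the remote rank pushes force halo data into this
    // rank's packed receive buffer, so its address is exchanged here;
    // recvRankF_ is an assumed member mirroring recvRankX_.
    //Force buffer:
    recvPtr = static_cast<void*> (d_recvBuf_);
    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankF_, 0,
                 &remoteFPtr_, sizeof(void*), MPI_BYTE, sendRankF_, 0,
                 mpi_comm_mysim_, MPI_STATUS_IGNORE);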
return;
}
+// The following method should be called after non-local buffer operations,
+// and before the local buffer operations. It operates in the non-local stream.
+void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
+{
+
+ // Communicate halo data (in non-local stream)
+ communicateHaloData(d_f_, HaloQuantity::HaloForces);
+
+ float3 *d_f = d_f_;
+
+ if (!accumulateForces)
+ {
+ //Clear local portion of force array (in local stream)
+ cudaMemsetAsync(d_f, 0, numHomeAtoms_*sizeof(rvec), localStream_);
+ }
+
+ // ensure non-local stream waits for local stream, due to dependence on
+ // the previous H2D copy of CPU forces (if accumulateForces is true)
+ // or the above clearing.
+    // TODO remove this dependency on localStream - Redmine issue #3093
+ GpuEventSynchronizer eventLocal;
+ eventLocal.markEvent(localStream_);
+ eventLocal.enqueueWaitEvent(nonLocalStream_);
+
+ //Unpack halo buffer into force array
+
+ KernelLaunchConfig config;
+ config.blockSize[0] = c_threadsPerBlock;
+ config.blockSize[1] = 1;
+ config.blockSize[2] = 1;
+ config.gridSize[0] = (fRecvSize_+c_threadsPerBlock-1)/c_threadsPerBlock;
+ config.gridSize[1] = 1;
+ config.gridSize[2] = 1;
+ config.sharedMemorySize = 0;
+ config.stream = nonLocalStream_;
+
+ const float3 *recvBuf = d_recvBuf_;
+ const int *indexMap = d_indexMap_;
+ const int size = fRecvSize_;
+
+ if (size > 0)
+ {
+ auto kernelFn = accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>;
+
+ const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_f,
+ &recvBuf, &indexMap,
+ &size);
+
+ launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
+ }
+}
+
void GpuHaloExchange::Impl::communicateHaloData(float3 * d_ptr,
HaloQuantity haloQuantity)
void * remotePtr;
int sendRank;
int recvRank;
+
if (haloQuantity == HaloQuantity::HaloCoordinates)
{
sendPtr = static_cast<void*> (d_sendBuf_);
remotePtr = remoteXPtr_;
sendRank = sendRankX_;
recvRank = recvRankX_;
+
+ //Wait for signal from receiving task that it is ready, and similarly send signal to task that will push data to this task
+ char thisTaskIsReady, remoteTaskIsReady;
+ MPI_Sendrecv(&thisTaskIsReady, sizeof(char), MPI_BYTE, recvRank, 0,
+ &remoteTaskIsReady, sizeof(char), MPI_BYTE, sendRank, 0,
+ mpi_comm_mysim_, MPI_STATUS_IGNORE);
}
else
{
- sendPtr = static_cast<void*> (&(d_ptr[localOffset_]));
+ sendPtr = static_cast<void*> (&(d_ptr[numHomeAtoms_]));
sendSize = fSendSize_;
remotePtr = remoteFPtr_;
sendRank = sendRankF_;
communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank);
}
-
void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void *sendPtr,
int sendSize,
int sendRank,
{
cudaError_t stat;
+ cudaStream_t stream = nonLocalStream_;
// We asynchronously push data to remote rank. The remote
// destination pointer has already been set in the init fn. We
// send data to neighbor, if any data exists to send
if (sendSize > 0)
{
- stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, nonLocalStream_);
+ stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize*DIM*sizeof(float), cudaMemcpyDeviceToDevice, stream);
CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
}
// its stream.
GpuEventSynchronizer *haloDataTransferRemote;
- haloDataTransferLaunched_->markEvent(nonLocalStream_);
+ haloDataTransferLaunched_->markEvent(stream);
MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0,
&haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0,
mpi_comm_mysim_, MPI_STATUS_IGNORE);
- haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
+ haloDataTransferRemote->enqueueWaitEvent(stream);
}
/*! \brief Create Domdec GPU object */
GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
+ void * localStream,
void * nonLocalStream,
void * coordinatesOnDeviceEvent)
: dd_(dd),
usePBC_(dd->ci[dd->dim[0]] == 0),
haloDataTransferLaunched_(new GpuEventSynchronizer()),
mpi_comm_mysim_(mpi_comm_mysim),
+ localStream_(*static_cast<cudaStream_t*> (localStream)),
nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream)),
coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
{
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
+ void *localStream,
void *nonLocalStream,
void *coordinatesOnDeviceEvent)
- : impl_(new Impl(dd, mpi_comm_mysim, nonLocalStream, coordinatesOnDeviceEvent))
+ : impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, coordinatesOnDeviceEvent))
{
}
GpuHaloExchange::~GpuHaloExchange() = default;
-void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer)
+void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer,
+ rvec *d_forcesBuffer)
{
- impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer));
+ impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer), reinterpret_cast<float3*>(d_forcesBuffer));
}
void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
impl_->communicateHaloCoordinates(box);
}
+void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
+{
+ impl_->communicateHaloForces(accumulateForces);
+}
+
} //namespace gmx
*
* \param [inout] dd domdec structure
* \param [in] mpi_comm_mysim communicator used for simulation
+ * \param [in] localStream local NB CUDA stream
* \param [in] nonLocalStream non-local NB CUDA stream
* \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
*/
Impl(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
+ void *localStream,
void *nonLocalStream,
void *coordinatesOnDeviceEvent);
~Impl();
/*! \brief
* (Re-) Initialization for GPU halo exchange
* \param [in] d_coordinatesBuffer pointer to coordinates buffer in GPU memory
+ * \param [in] d_forcesBuffer pointer to forces buffer in GPU memory
*/
- void reinitHalo(float3 *d_coordinatesBuffer);
+ void reinitHalo(float3 *d_coordinatesBuffer,
+ float3 *d_forcesBuffer);
/*! \brief
*/
void communicateHaloCoordinates(const matrix box);
+ /*! \brief GPU halo exchange of force buffer
+ * \param[in] accumulateForces True if forces should accumulate, otherwise they are set
+ */
+ void communicateHaloForces(bool accumulateForces);
+
private:
/*! \brief Data transfer wrapper for GPU halo exchange
* \param [inout] remotePtr remote address to recv data
* \param [in] recvRank rank to recv data from
*/
- void communicateHaloDataWithCudaDirect(void *sendPtr,
- int sendSize,
- int sendRank,
- void* remotePtr,
- int recvRank);
+ void communicateHaloDataWithCudaDirect(void *sendPtr,
+ int sendSize,
+ int sendRank,
+ void * remotePtr,
+ int recvRank);
//! Domain decomposition object
gmx_domdec_t *dd_ = nullptr;
int fSendSize_ = 0;
//! recv copy size to this rank for F
int fRecvSize_ = 0;
- //! offset of local halo region
- int localOffset_ = 0;
+ //! number of home atoms - offset of local halo region
+ int numHomeAtoms_ = 0;
//! remote GPU coordinates buffer pointer for pushing data
void *remoteXPtr_ = 0;
//! remote GPU force buffer pointer for pushing data
GpuEventSynchronizer *haloDataTransferLaunched_ = nullptr;
//! MPI communicator used for simulation
MPI_Comm mpi_comm_mysim_;
+ //! CUDA stream for local non-bonded calculations
+ cudaStream_t localStream_ = nullptr;
//! CUDA stream for non-local non-bonded calculations
cudaStream_t nonLocalStream_ = nullptr;
//! Event triggered when coordinates have been copied to device
GpuEventSynchronizer *coordinatesOnDeviceEvent_ = nullptr;
//! full coordinates buffer in GPU memory
float3 *d_x_ = nullptr;
+ //! full forces buffer in GPU memory
+ float3 *d_f_ = nullptr;
};
if (ddUsesGpuDirectCommunication)
{
rvec* d_x = static_cast<rvec *> (nbv->get_gpu_xrvec());
- gpuHaloExchange->reinitHalo(d_x);
+ rvec* d_f = static_cast<rvec *> (nbv->get_gpu_frvec());
+ gpuHaloExchange->reinitHalo(d_x, d_f);
}
}
else
}
}
+ const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
+ const bool useCpuPmeFReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+ // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
+ const bool haveCpuLocalForces = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuPmeFReduction ||
+ (fr->efep != efepNO));
+
if (havePPDomainDecomposition(cr))
{
/* We are done with the CPU compute.
if (forceFlags.computeForces)
{
- if (useGpuFBufOps == BufferOpsUseGpu::True)
+ gmx::ArrayRef<gmx::RVec> force = forceOut.forceWithShiftForces().force();
+ rvec *f = as_rvec_array(force.data());
+
+ if (useGpuForcesHaloExchange)
{
- nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
+ if (haveCpuLocalForces)
+ {
+ nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
+ }
+ bool accumulateHaloForces = haveCpuLocalForces;
+ gpuHaloExchange->communicateHaloForces(accumulateHaloForces);
}
- dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
+ else
+ {
+ if (useGpuFBufOps == BufferOpsUseGpu::True)
+ {
+ nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
+ }
+ dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
+ }
+
}
}
{
gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();
-
- const bool useCpuPmeFReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
- // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
- const bool haveCpuLocalForces = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuPmeFReduction ||
- (fr->efep != efepNO));
-
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
// Flag to specify whether the CPU force buffer has contributions to
// - CPU f H2D should be as soon as all CPU-side forces are done
// - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
// before the next CPU task that consumes the forces: vsite spread or update)
- //
+        //   - copy is not performed if GPU force halo exchange is active, because it would overwrite the result
+ // of the halo exchange. In that case the copy is instead performed above, before the exchange.
+ // These should be unified.
rvec *f = as_rvec_array(forceWithShift.data());
- if (haveLocalForceContribInCpuBuffer)
+ if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
{
nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
}
+ if (useGpuForcesHaloExchange)
+ {
+ // Add a stream synchronization to satisfy a dependency
+ // for the local buffer ops on the result of GPU halo
+ // exchange, which operates in the non-local stream and
+            // writes to the local part of the force buffer.
+ // TODO improve this through use of an event - see Redmine #3093
+ nbv->stream_local_wait_for_nonlocal();
+ }
nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
nbv->getDeviceForces(),
pme_gpu_get_device_f(fr->pmedata),
// TODO Move this to happen during domain decomposition setup,
// once stream and event handling works well with that.
+ // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
if (havePPDomainDecomposition(cr) && c_enableGpuHaloExchange && useGpuForNonbonded)
{
- void *stream = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+        void *streamLocal = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local);
+ void *streamNonLocal =
+ Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
- cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, stream, coordinatesOnDeviceEvent);
+ cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, streamLocal,
+ streamNonLocal, coordinatesOnDeviceEvent);
}
/* Initialize the mdAtoms structure.
return static_cast<void *> (gpu_nbv->xrvec);
}
+void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t *gpu_nbv)
+{
+ return static_cast<void *> (gpu_nbv->frvec);
+}
+
void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb)
{
return static_cast<void*> (nb->xAvailableOnDevice);
nb->xNonLocalCopyD2HDone->waitForEvent();
}
+void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_cuda_t *nb)
+{
+ cudaStream_t localStream = nb->stream[InteractionLocality::Local];
+ cudaStream_t nonLocalStream = nb->stream[InteractionLocality::NonLocal];
+
+ GpuEventSynchronizer event;
+ event.markEvent(nonLocalStream);
+ event.enqueueWaitEvent(localStream);
+}
+
} // namespace Nbnxm
Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
}
+void* nonbonded_verlet_t::get_gpu_frvec()
+{
+ return Nbnxm::nbnxn_get_gpu_frvec(gpu_nbv);
+}
+
+void nonbonded_verlet_t::stream_local_wait_for_nonlocal()
+{
+ Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv);
+}
+
/*! \endcond */
/*! \brief Wait for non-local copy of coordinate buffer from device to host */
void wait_nonlocal_x_copy_D2H_done();
+ /*! \brief return GPU pointer to f in rvec format */
+ void* get_gpu_frvec();
+
+ /*! \brief Ensure local stream waits for non-local stream */
+ void stream_local_wait_for_nonlocal();
+
//! Return the kernel setup
const Nbnxm::KernelSetup &kernelSetup() const
{
CUDA_FUNC_QUALIFIER
void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;
+/*! \brief return GPU pointer to f in rvec format
+ * \param[in] nb The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Ensure local stream waits for non-local stream
+ * \param[in] nb The nonbonded data GPU structure
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM;
+
} // namespace Nbnxm
#endif