Fix cycle counters for "comm.coord" and "Wait + Comm. F" to support GPU halo exchange...

author Gaurav Garg <gaugarg@nvidia.com>

Mon, 22 Jun 2020 16:31:13 +0000 (22:01 +0530)

committer Gaurav Garg <gaugarg@nvidia.com>

Sun, 5 Jul 2020 14:54:15 +0000 (14:54 +0000)
author Gaurav Garg <gaugarg@nvidia.com>
Mon, 22 Jun 2020 16:31:13 +0000 (22:01 +0530)
committer Gaurav Garg <gaugarg@nvidia.com>
Sun, 5 Jul 2020 14:54:15 +0000 (14:54 +0000)
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp

index 3e4996d3b623f8924fe07c154923049f9d8d58bc..fac3bf9a27c7a0a964330f31fff3c2cf512f7c1f 100644 (file)
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -3202,7 +3202,8 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const g
  
  void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
                                const t_commrec&                cr,
-                              const gmx::DeviceStreamManager& deviceStreamManager)
+                              const gmx::DeviceStreamManager& deviceStreamManager,
+                              gmx_wallcycle*                  wcycle)
  {
      GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                         "Local non-bonded stream should be valid when using"
@@ -3233,7 +3234,7 @@ void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
              cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
                      cr.dd, cr.mpi_comm_mysim, deviceStreamManager.context(),
                      deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
-                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse));
+                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse, wcycle));
          }
      }
  }
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h

index 6c9908536cea4e19ce39585bbe570e943db8d705..b6c79dca00bba143412327ea17bd55280d832e97 100644 (file)
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -318,10 +318,12 @@ void dd_bonded_cg_distance(const gmx::MDLogger&           mdlog,
   * \param[in] mdlog               The logger object.
   * \param[in] cr                  The commrec object.
   * \param[in] deviceStreamManager Manager of the GPU context and streams.
+ * \param[in] wcycle              The wallclock counter.
   */
  void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
                                const t_commrec&                cr,
-                              const gmx::DeviceStreamManager& deviceStreamManager);
+                              const gmx::DeviceStreamManager& deviceStreamManager,
+                              gmx_wallcycle*                  wcycle);
  
  /*! \brief
   * (Re-) Initialization for GPU halo exchange
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h

index b20ad0a1808003f32255ddfe86208312f57bdb13..9482009410c060ca3ebe880e5473fcdb89f7b76e 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -49,6 +49,7 @@
  #include "gromacs/utility/gmxmpi.h"
  
  struct gmx_domdec_t;
+struct gmx_wallcycle;
  class DeviceContext;
  class DeviceStream;
  class GpuEventSynchronizer;
@@ -86,13 +87,15 @@ public:
       * \param [in]    streamLocal              local NB CUDA stream.
       * \param [in]    streamNonLocal           non-local NB CUDA stream.
       * \param [in]    pulse                    the communication pulse for this instance
+     * \param [in]    wcycle                   The wallclock counter
       */
      GpuHaloExchange(gmx_domdec_t*        dd,
                      MPI_Comm             mpi_comm_mysim,
                      const DeviceContext& deviceContext,
                      const DeviceStream&  streamLocal,
                      const DeviceStream&  streamNonLocal,
-                    int                  pulse);
+                    int                  pulse,
+                    gmx_wallcycle*       wcycle);
      ~GpuHaloExchange();
  
      /*! \brief
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp

index e9106b7c8902a42503a6a4712831abb02be2a3d6..247c65b4e03959d02b4be10e3124d92a477b1f56 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -65,7 +65,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
                                   const DeviceContext& /* deviceContext */,
                                   const DeviceStream& /*streamLocal */,
                                   const DeviceStream& /*streamNonLocal */,
-                                 int /*pulse */) :
+                                 int /*pulse */,
+                                 gmx_wallcycle* /*wcycle*/) :
      impl_(nullptr)
  {
      GMX_ASSERT(false,
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu

index 0eebc6c0e4d3868e605a010b95e7792a4fcbdfd4..cbcff3defbc7274da10cbbbd2119fdb4cf129476 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -61,6 +61,7 @@
  #include "gromacs/gpu_utils/vectype_ops.cuh"
  #include "gromacs/math/vectypes.h"
  #include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/wallcycle.h"
  #include "gromacs/utility/gmxmpi.h"
  
  #include "domdec_internal.h"
@@ -205,6 +206,9 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
          coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
      }
  
+    wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEX);
+
      // launch kernel to pack send buffer
      KernelLaunchConfig config;
      config.blockSize[0]     = c_threadsPerBlock;
@@ -243,8 +247,17 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
                          "Domdec GPU Apply X Halo Exchange", kernelArgs);
      }
  
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEX);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+
+    // Consider time spent in communicateHaloData as Comm.X counter
+    // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
+    wallcycle_start(wcycle_, ewcMOVEX);
+
      communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);
  
+    wallcycle_stop(wcycle_, ewcMOVEX);
+
      return;
  }
  
@@ -252,10 +265,18 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
  // and before the local buffer operations. It operates in the non-local stream.
  void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
  {
+    // Consider time spent in communicateHaloData as Comm.F counter
+    // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
+    wallcycle_start(wcycle_, ewcMOVEF);
  
      // Communicate halo data (in non-local stream)
      communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr);
  
+    wallcycle_stop(wcycle_, ewcMOVEF);
+
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF);
+
      float3* d_f = d_f_;
  
      if (pulse_ == (dd_->comm->cd[0].numPulses() - 1))
@@ -313,6 +334,9 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
      {
          fReadyOnDevice_.markEvent(nonLocalStream_);
      }
+
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEF);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
  }
  
  
@@ -385,6 +409,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
      {
          stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize * DIM * sizeof(float),
                                 cudaMemcpyDeviceToDevice, nonLocalStream_.stream());
+
          CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
      }
  
@@ -419,7 +444,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
                              const DeviceContext& deviceContext,
                              const DeviceStream&  localStream,
                              const DeviceStream&  nonLocalStream,
-                            int                  pulse) :
+                            int                  pulse,
+                            gmx_wallcycle*       wcycle) :
      dd_(dd),
      sendRankX_(dd->neighbor[0][1]),
      recvRankX_(dd->neighbor[0][0]),
@@ -431,7 +457,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
      deviceContext_(deviceContext),
      localStream_(localStream),
      nonLocalStream_(nonLocalStream),
-    pulse_(pulse)
+    pulse_(pulse),
+    wcycle_(wcycle)
  {
  
      GMX_RELEASE_ASSERT(GMX_THREAD_MPI,
@@ -466,8 +493,9 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t*        dd,
                                   const DeviceContext& deviceContext,
                                   const DeviceStream&  localStream,
                                   const DeviceStream&  nonLocalStream,
-                                 int                  pulse) :
-    impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse))
+                                 int                  pulse,
+                                 gmx_wallcycle*       wcycle) :
+    impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
  {
  }
  
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh

index 9a033ed54f96d88e38f06c0f2864f8a0a9b7029f..77aaf3f9bfe1e5f5713cc90bd565d060f745aa0f 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -52,6 +52,8 @@
  #include "gromacs/gpu_utils/hostallocator.h"
  #include "gromacs/utility/gmxmpi.h"
  
+struct gmx_wallcycle;
+
  namespace gmx
  {
  
@@ -75,13 +77,15 @@ public:
       * \param [in]    localStream              local NB CUDA stream
       * \param [in]    nonLocalStream           non-local NB CUDA stream
       * \param [in]    pulse                    the communication pulse for this instance
+     * \param [in]    wcycle                   The wallclock counter
       */
      Impl(gmx_domdec_t*        dd,
           MPI_Comm             mpi_comm_mysim,
           const DeviceContext& deviceContext,
           const DeviceStream&  localStream,
           const DeviceStream&  nonLocalStream,
-         int                  pulse);
+         int                  pulse,
+         gmx_wallcycle*       wcycle);
      ~Impl();
  
      /*! \brief
@@ -198,6 +202,8 @@ private:
      int pulse_ = 0;
      //! Number of zones. Always 1 for 1-D case.
      const int nzone_ = 1;
+    //! The wallclock counter
+    gmx_wallcycle* wcycle_ = nullptr;
  };
  
  } // namespace gmx
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index 9ee0697c84d6c171340871cf7e4c45752d64edf2..5f5f863e816b40c8f93b9898ee1ccc67dde07a62 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -874,7 +874,7 @@ void gmx::LegacySimulator::do_md()
                                         "GPU device manager has to be initialized to use GPU "
                                         "version of halo exchange.");
                      // TODO remove need to pass local stream into GPU halo exchange - Issue #3093
-                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager);
+                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
                  }
              }
          }
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp

index 20495969e0a9aa42ee464e40df15d0822d504ff5..67890a96069099a9ecc9f73c08bb27919a49b9ac 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -145,13 +145,13 @@ void nonbonded_verlet_t::convertCoordinatesGpu(const gmx::AtomLocality locality,
                                                 DeviceBuffer<gmx::RVec> d_x,
                                                 GpuEventSynchronizer*   xReadyOnDevice)
  {
-    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
-    wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
+    wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS);
  
      nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal, gpu_nbv, d_x, xReadyOnDevice);
  
-    wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
-    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
  }
  
  gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
@@ -198,14 +198,14 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality lo
          return;
      }
  
-    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
-    wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
+    wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
  
      reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice,
                      dependencyList, gpu_nbv, useGpuFPmeReduction, accumulateForce);
  
-    wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
-    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
  }
  
  void nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone)
diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp

index 8c628fab39b43ec10f0294ea335af9e06ab16720..4e3e241d9660d6e9f15f582b6c9ce2e362150037 100644 (file)
--- a/src/gromacs/timing/wallcycle.cpp
+++ b/src/gromacs/timing/wallcycle.cpp
@@ -175,6 +175,10 @@ static const char* wcsn[ewcsNR] = {
      "NB X buffer ops.",
      "NB F buffer ops.",
      "Clear force buffer",
+    "Launch GPU NB X buffer ops.",
+    "Launch GPU NB F buffer ops.",
+    "Launch GPU Comm. coord.",
+    "Launch GPU Comm. force."
      "Test subcounter",
  };
  
diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h

index 785b53cd8ebde6b72098d42d606a1a52aaa0484d..83be456671553574acb6d265310769794f342278 100644 (file)
--- a/src/gromacs/timing/wallcycle.h
+++ b/src/gromacs/timing/wallcycle.h
@@ -127,6 +127,10 @@ enum
      ewcsNB_X_BUF_OPS,
      ewcsNB_F_BUF_OPS,
      ewcsCLEAR_FORCE_BUFFER,
+    ewcsLAUNCH_GPU_NB_X_BUF_OPS,
+    ewcsLAUNCH_GPU_NB_F_BUF_OPS,
+    ewcsLAUNCH_GPU_MOVEX,
+    ewcsLAUNCH_GPU_MOVEF,
      ewcsTEST,
      ewcsNR
  };
author	Gaurav Garg <gaugarg@nvidia.com>
	Mon, 22 Jun 2020 16:31:13 +0000 (22:01 +0530)
committer	Gaurav Garg <gaugarg@nvidia.com>
	Sun, 5 Jul 2020 14:54:15 +0000 (14:54 +0000)
src/gromacs/domdec/domdec.cpp		patch \| blob \| history
src/gromacs/domdec/domdec.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cpp		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cu		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cuh		patch \| blob \| history
src/gromacs/mdrun/md.cpp		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.cpp		patch \| blob \| history
src/gromacs/timing/wallcycle.cpp		patch \| blob \| history
src/gromacs/timing/wallcycle.h		patch \| blob \| history