Fix cycle counters for "comm.coord" and "Wait + Comm. F" to support GPU halo exchange...
authorGaurav Garg <gaugarg@nvidia.com>
Mon, 22 Jun 2020 16:31:13 +0000 (22:01 +0530)
committerGaurav Garg <gaugarg@nvidia.com>
Sun, 5 Jul 2020 14:54:15 +0000 (14:54 +0000)
Replace buffer ops counters with GPU launch counters for GPU backend.

src/gromacs/domdec/domdec.cpp
src/gromacs/domdec/domdec.h
src/gromacs/domdec/gpuhaloexchange.h
src/gromacs/domdec/gpuhaloexchange_impl.cpp
src/gromacs/domdec/gpuhaloexchange_impl.cu
src/gromacs/domdec/gpuhaloexchange_impl.cuh
src/gromacs/mdrun/md.cpp
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/timing/wallcycle.cpp
src/gromacs/timing/wallcycle.h

index 3e4996d3b623f8924fe07c154923049f9d8d58bc..fac3bf9a27c7a0a964330f31fff3c2cf512f7c1f 100644 (file)
@@ -3202,7 +3202,8 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const g
 
 void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
                               const t_commrec&                cr,
-                              const gmx::DeviceStreamManager& deviceStreamManager)
+                              const gmx::DeviceStreamManager& deviceStreamManager,
+                              gmx_wallcycle*                  wcycle)
 {
     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                        "Local non-bonded stream should be valid when using"
@@ -3233,7 +3234,7 @@ void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
             cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
                     cr.dd, cr.mpi_comm_mysim, deviceStreamManager.context(),
                     deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
-                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse));
+                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse, wcycle));
         }
     }
 }
index 6c9908536cea4e19ce39585bbe570e943db8d705..b6c79dca00bba143412327ea17bd55280d832e97 100644 (file)
@@ -318,10 +318,12 @@ void dd_bonded_cg_distance(const gmx::MDLogger&           mdlog,
  * \param[in] mdlog               The logger object.
  * \param[in] cr                  The commrec object.
  * \param[in] deviceStreamManager Manager of the GPU context and streams.
+ * \param[in] wcycle              The wallclock counter.
  */
 void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
                               const t_commrec&                cr,
-                              const gmx::DeviceStreamManager& deviceStreamManager);
+                              const gmx::DeviceStreamManager& deviceStreamManager,
+                              gmx_wallcycle*                  wcycle);
 
 /*! \brief
  * (Re-) Initialization for GPU halo exchange
index b20ad0a1808003f32255ddfe86208312f57bdb13..9482009410c060ca3ebe880e5473fcdb89f7b76e 100644 (file)
@@ -49,6 +49,7 @@
 #include "gromacs/utility/gmxmpi.h"
 
 struct gmx_domdec_t;
+struct gmx_wallcycle;
 class DeviceContext;
 class DeviceStream;
 class GpuEventSynchronizer;
@@ -86,13 +87,15 @@ public:
      * \param [in]    streamLocal              local NB CUDA stream.
      * \param [in]    streamNonLocal           non-local NB CUDA stream.
      * \param [in]    pulse                    the communication pulse for this instance
+     * \param [in]    wcycle                   The wallclock counter
      */
     GpuHaloExchange(gmx_domdec_t*        dd,
                     MPI_Comm             mpi_comm_mysim,
                     const DeviceContext& deviceContext,
                     const DeviceStream&  streamLocal,
                     const DeviceStream&  streamNonLocal,
-                    int                  pulse);
+                    int                  pulse,
+                    gmx_wallcycle*       wcycle);
     ~GpuHaloExchange();
 
     /*! \brief
index e9106b7c8902a42503a6a4712831abb02be2a3d6..247c65b4e03959d02b4be10e3124d92a477b1f56 100644 (file)
@@ -65,7 +65,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
                                  const DeviceContext& /* deviceContext */,
                                  const DeviceStream& /*streamLocal */,
                                  const DeviceStream& /*streamNonLocal */,
-                                 int /*pulse */) :
+                                 int /*pulse */,
+                                 gmx_wallcycle* /*wcycle*/) :
     impl_(nullptr)
 {
     GMX_ASSERT(false,
index 0eebc6c0e4d3868e605a010b95e7792a4fcbdfd4..cbcff3defbc7274da10cbbbd2119fdb4cf129476 100644 (file)
@@ -61,6 +61,7 @@
 #include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/gmxmpi.h"
 
 #include "domdec_internal.h"
@@ -205,6 +206,9 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
         coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
     }
 
+    wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEX);
+
     // launch kernel to pack send buffer
     KernelLaunchConfig config;
     config.blockSize[0]     = c_threadsPerBlock;
@@ -243,8 +247,17 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
                         "Domdec GPU Apply X Halo Exchange", kernelArgs);
     }
 
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEX);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+
+    // Consider time spent in communicateHaloData as Comm.X counter
+    // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
+    wallcycle_start(wcycle_, ewcMOVEX);
+
     communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);
 
+    wallcycle_stop(wcycle_, ewcMOVEX);
+
     return;
 }
 
@@ -252,10 +265,18 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
 // and before the local buffer operations. It operates in the non-local stream.
 void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
 {
+    // Consider time spent in communicateHaloData as Comm.F counter
+    // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
+    wallcycle_start(wcycle_, ewcMOVEF);
 
     // Communicate halo data (in non-local stream)
     communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr);
 
+    wallcycle_stop(wcycle_, ewcMOVEF);
+
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF);
+
     float3* d_f = d_f_;
 
     if (pulse_ == (dd_->comm->cd[0].numPulses() - 1))
@@ -313,6 +334,9 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
     {
         fReadyOnDevice_.markEvent(nonLocalStream_);
     }
+
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEF);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
 
@@ -385,6 +409,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
     {
         stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize * DIM * sizeof(float),
                                cudaMemcpyDeviceToDevice, nonLocalStream_.stream());
+
         CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
     }
 
@@ -419,7 +444,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
                             const DeviceContext& deviceContext,
                             const DeviceStream&  localStream,
                             const DeviceStream&  nonLocalStream,
-                            int                  pulse) :
+                            int                  pulse,
+                            gmx_wallcycle*       wcycle) :
     dd_(dd),
     sendRankX_(dd->neighbor[0][1]),
     recvRankX_(dd->neighbor[0][0]),
@@ -431,7 +457,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
     deviceContext_(deviceContext),
     localStream_(localStream),
     nonLocalStream_(nonLocalStream),
-    pulse_(pulse)
+    pulse_(pulse),
+    wcycle_(wcycle)
 {
 
     GMX_RELEASE_ASSERT(GMX_THREAD_MPI,
@@ -466,8 +493,9 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t*        dd,
                                  const DeviceContext& deviceContext,
                                  const DeviceStream&  localStream,
                                  const DeviceStream&  nonLocalStream,
-                                 int                  pulse) :
-    impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse))
+                                 int                  pulse,
+                                 gmx_wallcycle*       wcycle) :
+    impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
 {
 }
 
index 9a033ed54f96d88e38f06c0f2864f8a0a9b7029f..77aaf3f9bfe1e5f5713cc90bd565d060f745aa0f 100644 (file)
@@ -52,6 +52,8 @@
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/utility/gmxmpi.h"
 
+struct gmx_wallcycle;
+
 namespace gmx
 {
 
@@ -75,13 +77,15 @@ public:
      * \param [in]    localStream              local NB CUDA stream
      * \param [in]    nonLocalStream           non-local NB CUDA stream
      * \param [in]    pulse                    the communication pulse for this instance
+     * \param [in]    wcycle                   The wallclock counter
      */
     Impl(gmx_domdec_t*        dd,
          MPI_Comm             mpi_comm_mysim,
          const DeviceContext& deviceContext,
          const DeviceStream&  localStream,
          const DeviceStream&  nonLocalStream,
-         int                  pulse);
+         int                  pulse,
+         gmx_wallcycle*       wcycle);
     ~Impl();
 
     /*! \brief
@@ -198,6 +202,8 @@ private:
     int pulse_ = 0;
     //! Number of zones. Always 1 for 1-D case.
     const int nzone_ = 1;
+    //! The wallclock counter
+    gmx_wallcycle* wcycle_ = nullptr;
 };
 
 } // namespace gmx
index 9ee0697c84d6c171340871cf7e4c45752d64edf2..5f5f863e816b40c8f93b9898ee1ccc67dde07a62 100644 (file)
@@ -874,7 +874,7 @@ void gmx::LegacySimulator::do_md()
                                        "GPU device manager has to be initialized to use GPU "
                                        "version of halo exchange.");
                     // TODO remove need to pass local stream into GPU halo exchange - Issue #3093
-                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager);
+                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
                 }
             }
         }
index 20495969e0a9aa42ee464e40df15d0822d504ff5..67890a96069099a9ecc9f73c08bb27919a49b9ac 100644 (file)
@@ -145,13 +145,13 @@ void nonbonded_verlet_t::convertCoordinatesGpu(const gmx::AtomLocality locality,
                                                DeviceBuffer<gmx::RVec> d_x,
                                                GpuEventSynchronizer*   xReadyOnDevice)
 {
-    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
-    wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
+    wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS);
 
     nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal, gpu_nbv, d_x, xReadyOnDevice);
 
-    wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
-    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
 gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
@@ -198,14 +198,14 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality lo
         return;
     }
 
-    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
-    wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
+    wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
 
     reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice,
                     dependencyList, gpu_nbv, useGpuFPmeReduction, accumulateForce);
 
-    wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
-    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
 void nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone)
index 8c628fab39b43ec10f0294ea335af9e06ab16720..4e3e241d9660d6e9f15f582b6c9ce2e362150037 100644 (file)
@@ -175,6 +175,10 @@ static const char* wcsn[ewcsNR] = {
     "NB X buffer ops.",
     "NB F buffer ops.",
     "Clear force buffer",
+    "Launch GPU NB X buffer ops.",
+    "Launch GPU NB F buffer ops.",
+    "Launch GPU Comm. coord.",
+    "Launch GPU Comm. force."
     "Test subcounter",
 };
 
index 785b53cd8ebde6b72098d42d606a1a52aaa0484d..83be456671553574acb6d265310769794f342278 100644 (file)
@@ -127,6 +127,10 @@ enum
     ewcsNB_X_BUF_OPS,
     ewcsNB_F_BUF_OPS,
     ewcsCLEAR_FORCE_BUFFER,
+    ewcsLAUNCH_GPU_NB_X_BUF_OPS,
+    ewcsLAUNCH_GPU_NB_F_BUF_OPS,
+    ewcsLAUNCH_GPU_MOVEX,
+    ewcsLAUNCH_GPU_MOVEF,
     ewcsTEST,
     ewcsNR
 };