Add missing GPU force reduction cycle counting
authorSzilárd Páll <pall.szilard@gmail.com>
Thu, 22 Oct 2020 09:02:58 +0000 (09:02 +0000)
committerMark Abraham <mark.j.abraham@gmail.com>
Thu, 22 Oct 2020 09:02:58 +0000 (09:02 +0000)
Launch timing was missing in the GPU reduction refactoring added in
257d80. This change adds back the cycle counting reducing the time
leaking into the rest timer.

src/gromacs/mdlib/gpuforcereduction.h
src/gromacs/mdlib/gpuforcereduction_impl.cpp
src/gromacs/mdlib/gpuforcereduction_impl.cu
src/gromacs/mdlib/gpuforcereduction_impl.cuh
src/gromacs/mdrun/runner.cpp

index 71a5f712a9782eda385766b1ecd7942c7f5261bc..8aad5efaf35fe4eb82764767e2272afc6305af4e 100644 (file)
@@ -45,6 +45,7 @@
 
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/math/vectypes.h"
+#include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/classhelpers.h"
 #include "gromacs/utility/fixedcapacityvector.h"
@@ -73,8 +74,11 @@ public:
      *
      * \param [in] deviceContext GPU device context
      * \param [in] deviceStream  Stream to use for reduction
+     * \param [in] wcycle        Wall-clock cycle counter
      */
-    GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+    GpuForceReduction(const DeviceContext& deviceContext,
+                      const DeviceStream&  deviceStream,
+                      gmx_wallcycle*       wcycle);
     ~GpuForceReduction();
 
     /*! \brief Register a nbnxm-format force to be reduced
index ff32deef965ed9f30177a52902020af424fc9399..69876c7d15711ad318c4c193cfd09d25e66639c1 100644 (file)
@@ -57,7 +57,8 @@ class GpuForceReduction::Impl
 };
 
 GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */,
-                                     const DeviceStream& /* deviceStream */) :
+                                     const DeviceStream& /* deviceStream */,
+                                     gmx_wallcycle* /*wcycle*/) :
     impl_(nullptr)
 {
     GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
index 29c18daf33fcd119f6ded3f0f99f36581ae64b2e..afe7b03058f87fa9c8dbaca31250080b822e26e0 100644 (file)
@@ -105,9 +105,12 @@ static __global__ void reduceKernel(const float3* __restrict__ gm_nbnxmForce,
     return;
 }
 
-GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
+GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext,
+                              const DeviceStream&  deviceStream,
+                              gmx_wallcycle*       wcycle) :
     deviceContext_(deviceContext),
-    deviceStream_(deviceStream){};
+    deviceStream_(deviceStream),
+    wcycle_(wcycle){};
 
 void GpuForceReduction::Impl::reinit(float3*               baseForcePtr,
                                      const int             numAtoms,
@@ -123,10 +126,13 @@ void GpuForceReduction::Impl::reinit(float3*               baseForcePtr,
     accumulate_       = accumulate;
     completionMarker_ = completionMarker;
     cellInfo_.cell    = cell.data();
+
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     reallocateDeviceBuffer(&cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize,
                            &cellInfo_.cellSizeAlloc, deviceContext_);
     copyToDeviceBuffer(&cellInfo_.d_cell, &(cellInfo_.cell[atomStart]), 0, numAtoms_, deviceStream_,
                        GpuApiCallBehavior::Async, nullptr);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 
     dependencyList_.clear();
 };
@@ -150,6 +156,8 @@ void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependen
 
 void GpuForceReduction::Impl::execute()
 {
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
 
     if (numAtoms_ == 0)
     {
@@ -191,12 +199,17 @@ void GpuForceReduction::Impl::execute()
     {
         completionMarker_->markEvent(deviceStream_);
     }
+
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
 GpuForceReduction::Impl::~Impl(){};
 
-GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
-    impl_(new Impl(deviceContext, deviceStream))
+GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
+                                     const DeviceStream&  deviceStream,
+                                     gmx_wallcycle*       wcycle) :
+    impl_(new Impl(deviceContext, deviceStream, wcycle))
 {
 }
 
index 536e3fd33b11139760681e9e98912473cb10cb4a..8434a04c712f20ee6181bbc3a8e75ea9e4c053c7 100644 (file)
@@ -73,8 +73,9 @@ public:
      *
      * \param [in] deviceStream  Stream to use for reduction
      * \param [in] deviceContext GPU device context
+     * \param [in] wcycle        The wallclock counter
      */
-    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStreami, gmx_wallcycle* wcycle);
     ~Impl();
 
     /*! \brief Register a nbnxm-format force to be reduced
@@ -137,6 +138,8 @@ private:
     DeviceBuffer<RVec> rvecForceToAdd_ = nullptr;
     //! event to be marked when redcution launch has been completed
     GpuEventSynchronizer* completionMarker_ = nullptr;
+    //! The wallclock counter
+    gmx_wallcycle* wcycle_ = nullptr;
 };
 
 } // namespace gmx
index 9c60986fa382906011474ac80df20468189795db..4e27012f621aa242889bfc95532f482da32bc511 100644 (file)
@@ -1693,10 +1693,10 @@ int Mdrunner::mdrunner()
         {
             fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
                     deviceStreamManager->context(),
-                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal));
+                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal), wcycle);
             fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
                     deviceStreamManager->context(),
-                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal));
+                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal), wcycle);
         }
 
         std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;