Add missing GPU force reduction cycle counting

author Szilárd Páll <pall.szilard@gmail.com>

Thu, 22 Oct 2020 09:02:58 +0000 (09:02 +0000)

committer Mark Abraham <mark.j.abraham@gmail.com>

Thu, 22 Oct 2020 09:02:58 +0000 (09:02 +0000)
author Szilárd Páll <pall.szilard@gmail.com>
Thu, 22 Oct 2020 09:02:58 +0000 (09:02 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Thu, 22 Oct 2020 09:02:58 +0000 (09:02 +0000)
diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h

index 71a5f712a9782eda385766b1ecd7942c7f5261bc..8aad5efaf35fe4eb82764767e2272afc6305af4e 100644 (file)
--- a/src/gromacs/mdlib/gpuforcereduction.h
+++ b/src/gromacs/mdlib/gpuforcereduction.h
@@ -45,6 +45,7 @@
  
  #include "gromacs/gpu_utils/devicebuffer_datatype.h"
  #include "gromacs/math/vectypes.h"
  
  #include "gromacs/gpu_utils/devicebuffer_datatype.h"
  #include "gromacs/math/vectypes.h"
+#include "gromacs/timing/wallcycle.h"
  #include "gromacs/utility/arrayref.h"
  #include "gromacs/utility/classhelpers.h"
  #include "gromacs/utility/fixedcapacityvector.h"
  #include "gromacs/utility/arrayref.h"
  #include "gromacs/utility/classhelpers.h"
  #include "gromacs/utility/fixedcapacityvector.h"
@@ -73,8 +74,11 @@ public:
       *
       * \param [in] deviceContext GPU device context
       * \param [in] deviceStream  Stream to use for reduction
       *
       * \param [in] deviceContext GPU device context
       * \param [in] deviceStream  Stream to use for reduction
+     * \param [in] wcycle        Wall-clock cycle counter
       */
       */
-    GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+    GpuForceReduction(const DeviceContext& deviceContext,
+                      const DeviceStream&  deviceStream,
+                      gmx_wallcycle*       wcycle);
      ~GpuForceReduction();
  
      /*! \brief Register a nbnxm-format force to be reduced
      ~GpuForceReduction();
  
      /*! \brief Register a nbnxm-format force to be reduced
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp

index ff32deef965ed9f30177a52902020af424fc9399..69876c7d15711ad318c4c193cfd09d25e66639c1 100644 (file)
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp
@@ -57,7 +57,8 @@ class GpuForceReduction::Impl
  };
  
  GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */,
  };
  
  GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */,
-                                     const DeviceStream& /* deviceStream */) :
+                                     const DeviceStream& /* deviceStream */,
+                                     gmx_wallcycle* /*wcycle*/) :
      impl_(nullptr)
  {
      GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
      impl_(nullptr)
  {
      GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu

index 29c18daf33fcd119f6ded3f0f99f36581ae64b2e..afe7b03058f87fa9c8dbaca31250080b822e26e0 100644 (file)
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cu
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cu
@@ -105,9 +105,12 @@ static __global__ void reduceKernel(const float3* __restrict__ gm_nbnxmForce,
      return;
  }
  
      return;
  }
  
-GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
+GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext,
+                              const DeviceStream&  deviceStream,
+                              gmx_wallcycle*       wcycle) :
      deviceContext_(deviceContext),
      deviceContext_(deviceContext),
-    deviceStream_(deviceStream){};
+    deviceStream_(deviceStream),
+    wcycle_(wcycle){};
  
  void GpuForceReduction::Impl::reinit(float3*               baseForcePtr,
                                       const int             numAtoms,
  
  void GpuForceReduction::Impl::reinit(float3*               baseForcePtr,
                                       const int             numAtoms,
@@ -123,10 +126,13 @@ void GpuForceReduction::Impl::reinit(float3*               baseForcePtr,
      accumulate_       = accumulate;
      completionMarker_ = completionMarker;
      cellInfo_.cell    = cell.data();
      accumulate_       = accumulate;
      completionMarker_ = completionMarker;
      cellInfo_.cell    = cell.data();
+
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
      reallocateDeviceBuffer(&cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize,
                             &cellInfo_.cellSizeAlloc, deviceContext_);
      copyToDeviceBuffer(&cellInfo_.d_cell, &(cellInfo_.cell[atomStart]), 0, numAtoms_, deviceStream_,
                         GpuApiCallBehavior::Async, nullptr);
      reallocateDeviceBuffer(&cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize,
                             &cellInfo_.cellSizeAlloc, deviceContext_);
      copyToDeviceBuffer(&cellInfo_.d_cell, &(cellInfo_.cell[atomStart]), 0, numAtoms_, deviceStream_,
                         GpuApiCallBehavior::Async, nullptr);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
  
      dependencyList_.clear();
  };
  
      dependencyList_.clear();
  };
@@ -150,6 +156,8 @@ void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependen
  
  void GpuForceReduction::Impl::execute()
  {
  
  void GpuForceReduction::Impl::execute()
  {
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
  
      if (numAtoms_ == 0)
      {
  
      if (numAtoms_ == 0)
      {
@@ -191,12 +199,17 @@ void GpuForceReduction::Impl::execute()
      {
          completionMarker_->markEvent(deviceStream_);
      }
      {
          completionMarker_->markEvent(deviceStream_);
      }
+
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
  }
  
  GpuForceReduction::Impl::~Impl(){};
  
  }
  
  GpuForceReduction::Impl::~Impl(){};
  
-GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
-    impl_(new Impl(deviceContext, deviceStream))
+GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
+                                     const DeviceStream&  deviceStream,
+                                     gmx_wallcycle*       wcycle) :
+    impl_(new Impl(deviceContext, deviceStream, wcycle))
  {
  }
  
  {
  }
  
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cuh b/src/gromacs/mdlib/gpuforcereduction_impl.cuh

index 536e3fd33b11139760681e9e98912473cb10cb4a..8434a04c712f20ee6181bbc3a8e75ea9e4c053c7 100644 (file)
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cuh
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cuh
@@ -73,8 +73,9 @@ public:
       *
       * \param [in] deviceStream  Stream to use for reduction
       * \param [in] deviceContext GPU device context
       *
       * \param [in] deviceStream  Stream to use for reduction
       * \param [in] deviceContext GPU device context
+     * \param [in] wcycle        The wallclock counter
       */
       */
-    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStreami, gmx_wallcycle* wcycle);
      ~Impl();
  
      /*! \brief Register a nbnxm-format force to be reduced
      ~Impl();
  
      /*! \brief Register a nbnxm-format force to be reduced
@@ -137,6 +138,8 @@ private:
      DeviceBuffer<RVec> rvecForceToAdd_ = nullptr;
      //! event to be marked when redcution launch has been completed
      GpuEventSynchronizer* completionMarker_ = nullptr;
      DeviceBuffer<RVec> rvecForceToAdd_ = nullptr;
      //! event to be marked when redcution launch has been completed
      GpuEventSynchronizer* completionMarker_ = nullptr;
+    //! The wallclock counter
+    gmx_wallcycle* wcycle_ = nullptr;
  };
  
  } // namespace gmx
  };
  
  } // namespace gmx
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp

index 9c60986fa382906011474ac80df20468189795db..4e27012f621aa242889bfc95532f482da32bc511 100644 (file)
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1693,10 +1693,10 @@ int Mdrunner::mdrunner()
          {
              fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
                      deviceStreamManager->context(),
          {
              fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
                      deviceStreamManager->context(),
-                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal));
+                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal), wcycle);
              fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
                      deviceStreamManager->context(),
              fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
                      deviceStreamManager->context(),
-                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal));
+                    deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal), wcycle);
          }
  
          std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
          }
  
          std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
author	Szilárd Páll <pall.szilard@gmail.com>
	Thu, 22 Oct 2020 09:02:58 +0000 (09:02 +0000)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Thu, 22 Oct 2020 09:02:58 +0000 (09:02 +0000)
src/gromacs/mdlib/gpuforcereduction.h		patch \| blob \| history
src/gromacs/mdlib/gpuforcereduction_impl.cpp		patch \| blob \| history
src/gromacs/mdlib/gpuforcereduction_impl.cu		patch \| blob \| history
src/gromacs/mdlib/gpuforcereduction_impl.cuh		patch \| blob \| history
src/gromacs/mdrun/runner.cpp		patch \| blob \| history