Add missing wallcycle_stop
authorSzilárd Páll <pall.szilard@gmail.com>
Tue, 3 Nov 2020 12:47:16 +0000 (12:47 +0000)
committerMark Abraham <mark.j.abraham@gmail.com>
Tue, 3 Nov 2020 12:47:16 +0000 (12:47 +0000)
On the GPU update path the Update CPU counter was started and not
correctly stopped.

Refs #3764

src/gromacs/mdlib/update_constrain_gpu.h
src/gromacs/mdlib/update_constrain_gpu_impl.cpp
src/gromacs/mdlib/update_constrain_gpu_impl.cu
src/gromacs/mdlib/update_constrain_gpu_impl.h
src/gromacs/mdrun/md.cpp
src/gromacs/timing/wallcycle.cpp
src/gromacs/timing/wallcycle.h

index a30cbe825d305c92ccf9c211014bda5032b52dcc..581851f9a271cd93929b0f62ce8e145a2e92f98d 100644 (file)
@@ -46,6 +46,7 @@
 
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/mdtypes/group.h"
+#include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/classhelpers.h"
 
@@ -69,10 +70,10 @@ public:
     /*! \brief Create Update-Constrain object.
      *
      * The constructor is given a non-nullptr \p deviceStream, in which all the update and constrain
-     * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
-     * coordinates. The event is maintained outside this class and also passed to all (if any) consumers
-     * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
-     * markEvent(...) method is called unconditionally.
+     * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that
+     * modify coordinates. The event is maintained outside this class and also passed to all (if
+     * any) consumers of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr
+     * because the markEvent(...) method is called unconditionally.
      *
      * \param[in] ir                Input record data: LINCS takes number of iterations and order of
      *                              projection from it.
@@ -80,13 +81,16 @@ public:
      *                              and target O-H and H-H distances from this object.
      * \param[in] deviceContext     GPU device context.
      * \param[in] deviceStream      GPU stream to use.
-     * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that update is done on the GPU.
+     * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that update is done
+     *                              on the GPU.
+     * \param[in] wcycle            The wallclock counter
      */
     UpdateConstrainGpu(const t_inputrec&     ir,
                        const gmx_mtop_t&     mtop,
                        const DeviceContext&  deviceContext,
                        const DeviceStream&   deviceStream,
-                       GpuEventSynchronizer* xUpdatedOnDevice);
+                       GpuEventSynchronizer* xUpdatedOnDevice,
+                       gmx_wallcycle*        wcycle);
 
     ~UpdateConstrainGpu();
 
index 5c4afd2acd0757b48298ff9f3b8220ab71250319..dc2b0421c3d8ce1ce1e1e0e959bdb6a7dc3abd67 100644 (file)
@@ -60,7 +60,8 @@ UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& /* ir   */,
                                        const gmx_mtop_t& /* mtop */,
                                        const DeviceContext& /* deviceContext */,
                                        const DeviceStream& /* deviceStream */,
-                                       GpuEventSynchronizer* /* xUpdatedOnDevice */) :
+                                       GpuEventSynchronizer* /* xUpdatedOnDevice */,
+                                       gmx_wallcycle* /*wcycle*/) :
     impl_(nullptr)
 {
     GMX_ASSERT(!impl_,
index b9dd8632db293e8a7ad3e60e72ab0d1601ced86c..825890ce82617273dd07e7d048fa05ee55d7b3e8 100644 (file)
@@ -67,6 +67,7 @@
 #include "gromacs/mdlib/settle_gpu.cuh"
 #include "gromacs/mdlib/update_constrain_gpu.h"
 #include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/timing/wallcycle.h"
 
 namespace gmx
 {
@@ -116,6 +117,9 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer*             fRead
                                          const float                       dtPressureCouple,
                                          const matrix                      prVelocityScalingMatrix)
 {
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+
     // Clearing virial matrix
     // TODO There is no point in having separate virial matrix for constraints
     clear_mat(virial);
@@ -145,11 +149,17 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer*             fRead
 
     coordinatesReady_->markEvent(deviceStream_);
 
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+
     return;
 }
 
 void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
 {
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+
     ScalingMatrix mu;
     mu.xx = scalingMatrix[XX][XX];
     mu.yy = scalingMatrix[YY][YY];
@@ -165,10 +175,16 @@ void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
     // TODO: Although this only happens on the pressure coupling steps, this synchronization
     //       can affect the performance if nstpcouple is small.
     deviceStream_.synchronize();
+
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
 void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix)
 {
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+
     ScalingMatrix mu;
     mu.xx = scalingMatrix[XX][XX];
     mu.yy = scalingMatrix[YY][YY];
@@ -184,16 +200,21 @@ void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix)
     // TODO: Although this only happens on the pressure coupling steps, this synchronization
     //       can affect the performance if nstpcouple is small.
     deviceStream_.synchronize();
+
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
 UpdateConstrainGpu::Impl::Impl(const t_inputrec&     ir,
                                const gmx_mtop_t&     mtop,
                                const DeviceContext&  deviceContext,
                                const DeviceStream&   deviceStream,
-                               GpuEventSynchronizer* xUpdatedOnDevice) :
+                               GpuEventSynchronizer* xUpdatedOnDevice,
+                               gmx_wallcycle*        wcycle) :
     deviceContext_(deviceContext),
     deviceStream_(deviceStream),
-    coordinatesReady_(xUpdatedOnDevice)
+    coordinatesReady_(xUpdatedOnDevice),
+    wcycle_(wcycle)
 {
     GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr.");
 
@@ -217,6 +238,10 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer<RVec>            d_x,
                                    const t_mdatoms&              md,
                                    const int                     numTempScaleValues)
 {
+    // TODO wallcycle
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+
     GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
     GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
     GMX_ASSERT(d_f != nullptr, "Forces device buffer should not be null.");
@@ -239,10 +264,14 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer<RVec>            d_x,
 
     coordinateScalingKernelLaunchConfig_.gridSize[0] =
             (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
+
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
 void UpdateConstrainGpu::Impl::setPbc(const PbcType pbcType, const matrix box)
 {
+    // TODO wallcycle
     setPbcAiuc(numPbcDimensions(pbcType), box, &pbcAiuc_);
 }
 
@@ -255,8 +284,9 @@ UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec&     ir,
                                        const gmx_mtop_t&     mtop,
                                        const DeviceContext&  deviceContext,
                                        const DeviceStream&   deviceStream,
-                                       GpuEventSynchronizer* xUpdatedOnDevice) :
-    impl_(new Impl(ir, mtop, deviceContext, deviceStream, xUpdatedOnDevice))
+                                       GpuEventSynchronizer* xUpdatedOnDevice,
+                                       gmx_wallcycle*        wcycle) :
+    impl_(new Impl(ir, mtop, deviceContext, deviceStream, xUpdatedOnDevice, wcycle))
 {
 }
 
index 9ee067791318d5d70ee2c34c3029711ad06220ad..7453a98105712aca5264b16aa737789d8f53e213 100644 (file)
@@ -66,10 +66,10 @@ public:
     /*! \brief Create Update-Constrain object.
      *
      * The constructor is given a non-nullptr \p deviceStream, in which all the update and constrain
-     * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
-     * coordinates. The event is maintained outside this class and also passed to all (if any) consumers
-     * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
-     * markEvent(...) method is called unconditionally.
+     * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that
+     * modify coordinates. The event is maintained outside this class and also passed to all (if
+     * any) consumers of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr
+     * because the markEvent(...) method is called unconditionally.
      *
      * \param[in] ir                Input record data: LINCS takes number of iterations and order of
      *                              projection from it.
@@ -77,13 +77,16 @@ public:
      *                              and target O-H and H-H distances from this object.
      * \param[in] deviceContext     GPU device context.
      * \param[in] deviceStream      GPU stream to use.
-     * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that update is done on the GPU.
+     * \param[in] xUpdatedOnDevice  The event synchronizer to use to mark that
+     *                              update is done on the GPU.
+     * \param[in] wcycle            The wallclock counter
      */
     Impl(const t_inputrec&     ir,
          const gmx_mtop_t&     mtop,
          const DeviceContext&  deviceContext,
          const DeviceStream&   deviceStream,
-         GpuEventSynchronizer* xUpdatedOnDevice);
+         GpuEventSynchronizer* xUpdatedOnDevice,
+         gmx_wallcycle*        wcycle);
 
     ~Impl();
 
@@ -220,6 +223,8 @@ private:
 
     //! An pointer to the event to indicate when the update of coordinates is complete
     GpuEventSynchronizer* coordinatesReady_;
+    //! The wallclock counter
+    gmx_wallcycle* wcycle_ = nullptr;
 };
 
 } // namespace gmx
index f010e6a10a86a450fe4bf0a66a7cf05fcb752126..c20574e37fb4494c93e330ae67e924711c0d5be1 100644 (file)
@@ -415,7 +415,7 @@ void gmx::LegacySimulator::do_md()
         integrator = std::make_unique<UpdateConstrainGpu>(
                 *ir, *top_global, fr->deviceStreamManager->context(),
                 fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
-                stateGpu->xUpdatedOnDevice());
+                stateGpu->xUpdatedOnDevice(), wcycle);
 
         integrator->setPbc(PbcType::Xyz, state->box);
     }
@@ -1255,6 +1255,8 @@ void gmx::LegacySimulator::do_md()
 
         if (useGpuForUpdate)
         {
+            wallcycle_stop(wcycle, ewcUPDATE);
+
             if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
             {
                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
index e69df666c1d9f40c340856b8774fbd5b07c3a852..f1b428bf05ccaffc7691d4a36204cef54f5c919c 100644 (file)
@@ -180,6 +180,7 @@ static const char* wcsn[ewcsNR] = {
     "Launch GPU NB F buffer ops.",
     "Launch GPU Comm. coord.",
     "Launch GPU Comm. force.",
+    "Launch GPU update",
     "Test subcounter",
 };
 
index 0bc53f0fe69f9db152776eaf2230b9c4b36001ea..2cbc7fd3cb192ba6fe18ea7f4b3ba1afde2ffe7d 100644 (file)
@@ -132,6 +132,7 @@ enum
     ewcsLAUNCH_GPU_NB_F_BUF_OPS,
     ewcsLAUNCH_GPU_MOVEX,
     ewcsLAUNCH_GPU_MOVEF,
+    ewcsLAUNCH_GPU_UPDATE_CONSTRAIN,
     ewcsTEST,
     ewcsNR
 };