Move cycle counting into the GPU bonded module
authorSzilárd Páll <pall.szilard@gmail.com>
Fri, 26 Jul 2019 18:31:49 +0000 (20:31 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Wed, 31 Jul 2019 07:36:06 +0000 (09:36 +0200)
Additionally:
- removed some outdated TODOs;
- renamed accumulateEnergyTerms() to waitAccumulateEnergyTerms()
to reflect that it does a blocking wait for results.

Change-Id: I1bd7f1ddecea94ff2696721c9f2d6eac45bea738

src/gromacs/listed_forces/gpubonded.h
src/gromacs/listed_forces/gpubonded_impl.cpp
src/gromacs/listed_forces/gpubonded_impl.cu
src/gromacs/listed_forces/gpubonded_impl.h
src/gromacs/mdlib/forcerec.cpp
src/gromacs/mdlib/forcerec.h
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/runner.cpp

index df63d675d074ee7188d37684084da11893330de8..6ea5541f3ee580f021d4c187ba9d51912b2c1027 100644 (file)
@@ -59,6 +59,7 @@ struct gmx_mtop_t;
 struct t_forcerec;
 struct t_idef;
 struct t_inputrec;
+struct gmx_wallcycle;
 
 /*! \brief The number on bonded function types supported on GPUs */
 static constexpr int numFTypesOnGpu = 8;
@@ -112,7 +113,8 @@ class GpuBonded
     public:
         //! Construct the manager with constant data and the stream to use.
         GpuBonded(const gmx_ffparams_t &ffparams,
-                  void                 *streamPtr);
+                  void                 *streamPtr,
+                  gmx_wallcycle        *wcycle);
         //! Destructor
         ~GpuBonded();
 
@@ -138,7 +140,7 @@ class GpuBonded
         /*! \brief Launches the transfer of computed bonded energies. */
         void launchEnergyTransfer();
         /*! \brief Waits on the energy transfer, and accumulates bonded energies to \c enerd. */
-        void accumulateEnergyTerms(gmx_enerdata_t *enerd);
+        void waitAccumulateEnergyTerms(gmx_enerdata_t *enerd);
         /*! \brief Clears the device side energy buffer */
         void clearEnergies();
 
index 328b225fa57e0008316b52d9494af72ba87918af..178d241a59c700322e3c6f7faad9b672e2165035 100644 (file)
@@ -165,7 +165,8 @@ class GpuBonded::Impl
 };
 
 GpuBonded::GpuBonded(const gmx_ffparams_t & /* ffparams */,
-                     void                 * /*streamPtr */)
+                     void                 * /*streamPtr */,
+                     gmx_wallcycle        * /* wcycle */)
     : impl_(nullptr)
 {
 }
@@ -200,7 +201,7 @@ GpuBonded::launchEnergyTransfer()
 }
 
 void
-GpuBonded::accumulateEnergyTerms(gmx_enerdata_t * /* enerd */)
+GpuBonded::waitAccumulateEnergyTerms(gmx_enerdata_t * /* enerd */)
 {
 }
 
index d269c6918bcf6757f204f2dd58b2e1b049e6aa84..e71738f586c160bbb41ef7896b9eadff3812049c 100644 (file)
@@ -52,6 +52,7 @@
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/mdtypes/enerdata.h"
+#include "gromacs/timing/wallcycle.h"
 #include "gromacs/topology/forcefieldparameters.h"
 
 struct t_forcerec;
@@ -62,9 +63,11 @@ namespace gmx
 // ---- GpuBonded::Impl
 
 GpuBonded::Impl::Impl(const gmx_ffparams_t &ffparams,
-                      void                 *streamPtr)
+                      void                 *streamPtr,
+                      gmx_wallcycle        *wcycle)
 {
     stream_ = *static_cast<CommandStream*>(streamPtr);
+    wcycle_ = wcycle;
 
     allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), nullptr);
     // This could be an async transfer (if the source is pinned), so
@@ -263,25 +266,26 @@ GpuBonded::Impl::haveInteractions() const
 void
 GpuBonded::Impl::launchEnergyTransfer()
 {
-    // TODO should wrap with ewcLAUNCH_GPU
     GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed, so transfer should not be called");
 
+    wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED);
     // TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage)
     float *h_vTot   = vTot_.data();
     copyFromDeviceBuffer(h_vTot, &d_vTot_,
                          0, F_NRE,
                          stream_, GpuApiCallBehavior::Async, nullptr);
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
 }
 
 void
-GpuBonded::Impl::accumulateEnergyTerms(gmx_enerdata_t *enerd)
+GpuBonded::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t *enerd)
 {
-    // TODO should wrap with some kind of wait counter, so not all
-    // wait goes in to the "Rest" counter
     GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed or transferred, so accumulation should not occur");
 
+    wallcycle_start(wcycle_, ewcWAIT_GPU_BONDED);
     cudaError_t stat = cudaStreamSynchronize(stream_);
     CU_RET_ERR(stat, "D2H transfer of bonded energies failed");
+    wallcycle_stop(wcycle_, ewcWAIT_GPU_BONDED);
 
     for (int fType : fTypesOnGpu)
     {
@@ -301,15 +305,19 @@ GpuBonded::Impl::accumulateEnergyTerms(gmx_enerdata_t *enerd)
 void
 GpuBonded::Impl::clearEnergies()
 {
-    // TODO should wrap with ewcLAUNCH_GPU
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED);
     clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
 // ---- GpuBonded
 
 GpuBonded::GpuBonded(const gmx_ffparams_t &ffparams,
-                     void                 *streamPtr)
-    : impl_(new Impl(ffparams, streamPtr))
+                     void                 *streamPtr,
+                     gmx_wallcycle        *wcycle)
+    : impl_(new Impl(ffparams, streamPtr, wcycle))
 {
 }
 
@@ -339,9 +347,9 @@ GpuBonded::launchEnergyTransfer()
 }
 
 void
-GpuBonded::accumulateEnergyTerms(gmx_enerdata_t *enerd)
+GpuBonded::waitAccumulateEnergyTerms(gmx_enerdata_t *enerd)
 {
-    impl_->accumulateEnergyTerms(enerd);
+    impl_->waitAccumulateEnergyTerms(enerd);
 }
 
 void
index f33f6453246ac60bea10dc0c330d39c4e888b6fc..436cc88e566a3ec6399fc0ac871a0d2c15f29efa 100644 (file)
@@ -130,7 +130,8 @@ class GpuBonded::Impl
     public:
         //! Constructor
         Impl(const gmx_ffparams_t &ffparams,
-             void                 *streamPtr);
+             void                 *streamPtr,
+             gmx_wallcycle        *wcycle);
         /*! \brief Destructor, non-default needed for freeing
          * device-side buffers */
         ~Impl();
@@ -158,7 +159,7 @@ class GpuBonded::Impl
         /*! \brief Launches the transfer of computed bonded energies. */
         void launchEnergyTransfer();
         /*! \brief Waits on the energy transfer, and accumulates bonded energies to \c enerd. */
-        void accumulateEnergyTerms(gmx_enerdata_t *enerd);
+        void waitAccumulateEnergyTerms(gmx_enerdata_t *enerd);
         /*! \brief Clears the device side energy buffer */
         void clearEnergies();
     private:
@@ -190,6 +191,9 @@ class GpuBonded::Impl
 
         //! Parameters and pointers, passed to the CUDA kernel
         BondedCudaKernelParameters kernelParams_;
+
+        //! \brief Pointer to wallcycle structure.
+        gmx_wallcycle    *wcycle_;
 };
 
 }   // namespace gmx
index a76f1d648a452317ace3fa7914c83600725cc355..af151e519d316417449bd02bf0605a768457a740 100644 (file)
@@ -1469,7 +1469,8 @@ void init_forcerec(FILE                             *fp,
                    const gmx_device_info_t          *deviceInfo,
                    const bool                        useGpuForBonded,
                    gmx_bool                          bNoSolvOpt,
-                   real                              print_force)
+                   real                              print_force,
+                   gmx_wallcycle                    *wcycle)
 {
     real           rtab;
     char          *env;
@@ -1989,7 +1990,8 @@ void init_forcerec(FILE                             *fp,
             // TODO the heap allocation is only needed while
             // t_forcerec lacks a constructor.
             fr->gpuBonded = new gmx::GpuBonded(mtop->ffparams,
-                                               stream);
+                                               stream,
+                                               wcycle);
         }
     }
 
index 56867ad2e70e360c299fcdcfe4de6a12426089e1..5ac1a3b2ffe2bcfd693d143c88ee5656f327bfa1 100644 (file)
@@ -51,6 +51,7 @@ struct t_fcdata;
 struct t_filenm;
 struct t_inputrec;
 struct gmx_gpu_info_t;
+struct gmx_wallcycle;
 
 namespace gmx
 {
@@ -113,6 +114,7 @@ void init_interaction_const_tables(FILE                   *fp,
  * \param[in]  useGpuForBonded  Whether bonded interactions will run on a GPU
  * \param[in]  bNoSolvOpt  Do not use solvent optimization
  * \param[in]  print_force Print forces for atoms with force >= print_force
+ * \param[out] wcycle      Pointer to cycle counter object
  */
 void init_forcerec(FILE                             *fplog,
                    const gmx::MDLogger              &mdlog,
@@ -129,7 +131,8 @@ void init_forcerec(FILE                             *fplog,
                    const gmx_device_info_t          *deviceInfo,
                    bool                              useGpuForBonded,
                    gmx_bool                          bNoSolvOpt,
-                   real                              print_force);
+                   real                              print_force,
+                   gmx_wallcycle                    *wcycle);
 
 /*! \brief Divide exclusions over threads
  *
index 309ab61c10d44162020ec8a1baac007a7dc6538b..a1f2f5a773d54bea43962b92e45348e83ea51d86 100644 (file)
@@ -853,18 +853,12 @@ launchGpuEndOfStepTasks(nonbonded_verlet_t         *nbv,
 
     if (forceWorkload.haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
     {
-        wallcycle_start(wcycle, ewcWAIT_GPU_BONDED);
         // in principle this should be included in the DD balancing region,
         // but generally it is infrequent so we'll omit it for the sake of
         // simpler code
-        gpuBonded->accumulateEnergyTerms(enerd);
-        wallcycle_stop(wcycle, ewcWAIT_GPU_BONDED);
+        gpuBonded->waitAccumulateEnergyTerms(enerd);
 
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
         gpuBonded->clearEnergies();
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
 }
 
@@ -1240,9 +1234,7 @@ void do_force(FILE                                     *fplog,
 
         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
         {
-            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_BONDED);
             fr->gpuBonded->launchEnergyTransfer();
-            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
         }
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
index 0dce2adb1b19d77b92c55e6d2b98f0da92877b5e..e72e5a850317efe8bfd7dfa6a021e99a7d6fc44f 100644 (file)
@@ -1277,7 +1277,8 @@ int Mdrunner::mdrunner()
                       *hwinfo, nonbondedDeviceInfo,
                       useGpuForBonded,
                       FALSE,
-                      pforce);
+                      pforce,
+                      wcycle);
 
         /* Initialize the mdAtoms structure.
          * mdAtoms is not filled with atom data,