Store wcycle pointer in nonbonded_verlet_t

author Szilárd Páll <pall.szilard@gmail.com>

Mon, 29 Jul 2019 16:24:46 +0000 (18:24 +0200)

committer Artem Zhmurov <zhmurov@gmail.com>

Wed, 31 Jul 2019 11:21:13 +0000 (13:21 +0200)
author Szilárd Páll <pall.szilard@gmail.com>
Mon, 29 Jul 2019 16:24:46 +0000 (18:24 +0200)
committer Artem Zhmurov <zhmurov@gmail.com>
Wed, 31 Jul 2019 11:21:13 +0000 (13:21 +0200)
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index af151e519d316417449bd02bf0605a768457a740..db5bd9fd1df02e0e58eb6f252a0e78239fe155e8 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -1980,7 +1980,7 @@ void init_forcerec(FILE                             *fp,
  
          fr->nbv = Nbnxm::init_nb_verlet(mdlog, bFEP_NonBonded, ir, fr,
                                          cr, hardwareInfo, deviceInfo,
-                                        mtop, box);
+                                        mtop, box, wcycle);
  
          if (useGpuForBonded)
          {
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index e525fad2974fb2fda89edbbc6b9bca158520d32c..70a25665a2c574cfcbdc6258dd1d77655e5653d7 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -346,7 +346,7 @@ static void do_nb_verlet(t_forcerec                       *fr,
          }
      }
  
-    nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb, wcycle);
+    nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb);
  }
  
  static inline void clear_rvecs_omp(int n, rvec v[])
@@ -693,8 +693,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
                  nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
                                                as_rvec_array(force->unpaddedArrayRef().data()),
                                                BufferOpsUseGpu::False,
-                                              GpuBufferOpsAccumulateForce::Null,
-                                              wcycle);
+                                              GpuBufferOpsAccumulateForce::Null);
              }
          }
      }
@@ -831,11 +830,7 @@ launchGpuEndOfStepTasks(nonbonded_verlet_t         *nbv,
           */
          if (nbv->isDynamicPruningStepGpu(step))
          {
-            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
              nbv->dispatchPruneKernelGpu(step);
-            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
          }
  
          /* now clear the GPU outputs while we finish the step on the CPU */
@@ -1108,13 +1103,13 @@ void do_force(FILE                                     *fplog,
          // NS step is also a virial step (on which f buf ops are deactivated).
          if (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
          {
-            nbv->atomdata_init_add_nbat_f_to_f_gpu(wcycle);
+            nbv->atomdata_init_add_nbat_f_to_f_gpu();
          }
      }
      else
      {
          nbv->setCoordinates(Nbnxm::AtomLocality::Local, false,
-                            x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle);
+                            x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata));
      }
  
      if (bUseGPU)
@@ -1181,7 +1176,7 @@ void do_force(FILE                                     *fplog,
              dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
  
              nbv->setCoordinates(Nbnxm::AtomLocality::NonLocal, false,
-                                x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle);
+                                x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata));
  
          }
  
@@ -1318,14 +1313,14 @@ void do_force(FILE                                     *fplog,
          nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local,
                                        fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms,
                                        inputrec->fepvals, lambda.data(),
-                                      enerd, flags, nrnb, wcycle);
+                                      enerd, flags, nrnb);
  
          if (havePPDomainDecomposition(cr))
          {
              nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal,
                                            fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms,
                                            inputrec->fepvals, lambda.data(),
-                                          enerd, flags, nrnb, wcycle);
+                                          enerd, flags, nrnb);
          }
      }
  
@@ -1344,8 +1339,7 @@ void do_force(FILE                                     *fplog,
          wallcycle_stop(wcycle, ewcFORCE);
          nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f,
                                        BufferOpsUseGpu::False,
-                                      GpuBufferOpsAccumulateForce::Null,
-                                      wcycle);
+                                      GpuBufferOpsAccumulateForce::Null);
  
          wallcycle_start_nocount(wcycle, ewcFORCE);
  
@@ -1427,7 +1421,7 @@ void do_force(FILE                                     *fplog,
                  nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
              }
              nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
-                                          forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
+                                          forceOut.f, useGpuFBufOps, accumulateForce);
              if (useGpuFBufOps == BufferOpsUseGpu::True)
              {
                  nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
@@ -1536,7 +1530,7 @@ void do_force(FILE                                     *fplog,
              nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
          }
          nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
-                                      forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
+                                      forceOut.f, useGpuFBufOps, accumulateForce);
          if (useGpuFBufOps == BufferOpsUseGpu::True)
          {
              nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp

index 99b96d11b6fce7fd9c8ac9c2fd2187e958841873..a600029a2b14977bbe062eda09dfe796b4ef5640 100644 (file)
--- a/src/gromacs/nbnxm/kerneldispatch.cpp
+++ b/src/gromacs/nbnxm/kerneldispatch.cpp
@@ -467,8 +467,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality
                                              int                        clearF,
                                              const t_forcerec          &fr,
                                              gmx_enerdata_t            *enerd,
-                                            t_nrnb                    *nrnb,
-                                            gmx_wallcycle             *wcycle)
+                                            t_nrnb                    *nrnb)
  {
      const PairlistSet &pairlistSet = pairlistSets().pairlistSet(iLocality);
  
@@ -488,7 +487,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality
                               fr.bBHAM ?
                               enerd->grpp.ener[egBHAMSR].data() :
                               enerd->grpp.ener[egLJSR].data(),
-                             wcycle);
+                             wcycle_);
              break;
  
          case Nbnxm::KernelType::Gpu8x8x8:
@@ -527,8 +526,7 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality  iLocali
                                               real                       *lambda,
                                               gmx_enerdata_t             *enerd,
                                               const int                   forceFlags,
-                                             t_nrnb                     *nrnb,
-                                             gmx_wallcycle              *wcycle)
+                                             t_nrnb                     *nrnb)
  {
      const auto nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists();
  
@@ -567,7 +565,7 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality  iLocali
  
      GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(), "Number of lists should be same as number of NB threads");
  
-    wallcycle_sub_start(wcycle, ewcsNONBONDED_FEP);
+    wallcycle_sub_start(wcycle_, ewcsNONBONDED_FEP);
  #pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
      for (int th = 0; th < nbl_fep.ssize(); th++)
      {
@@ -624,5 +622,5 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality  iLocali
              enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
          }
      }
-    wallcycle_sub_stop(wcycle, ewcsNONBONDED_FEP);
+    wallcycle_sub_stop(wcycle_, ewcsNONBONDED_FEP);
  }
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp

index 681dcf2d8bbc8c36ee72743ca25b25d5810ccbbc..77119044cbcd08cc30362feb737b8c6e5ce4d0ea 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -136,11 +136,10 @@ void nonbonded_verlet_t::setCoordinates(const Nbnxm::AtomLocality       locality
                                          const bool                      fillLocal,
                                          gmx::ArrayRef<const gmx::RVec>  x,
                                          BufferOpsUseGpu                 useGpu,
-                                        void                           *xPmeDevicePtr,
-                                        gmx_wallcycle                  *wcycle)
+                                        void                           *xPmeDevicePtr)
  {
-    wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
-    wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
  
      auto fnPtr = (useGpu == BufferOpsUseGpu::True) ?
          nbnxn_atomdata_copy_x_to_nbat_x<true> :
@@ -150,8 +149,8 @@ void nonbonded_verlet_t::setCoordinates(const Nbnxm::AtomLocality       locality
            as_rvec_array(x.data()),
            nbat.get(), gpu_nbv, xPmeDevicePtr);
  
-    wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
-    wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
+    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
  }
  
  gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
@@ -163,8 +162,7 @@ void
  nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality           locality,
                                               rvec                               *f,
                                               BufferOpsUseGpu                     useGpu,
-                                             GpuBufferOpsAccumulateForce         accumulateForce,
-                                             gmx_wallcycle                      *wcycle)
+                                             GpuBufferOpsAccumulateForce         accumulateForce)
  {
  
      GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) &&
@@ -178,22 +176,22 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality
          return;
      }
  
-    wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
-    wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
  
      auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>;
      fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce);
  
-    wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
-    wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
+    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
  }
  
  void
-nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle)
+nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
  {
  
-    wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
-    wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
  
      const Nbnxm::GridSet      &gridSet = pairSearch_->gridSet();
  
@@ -201,8 +199,8 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle)
                                            gpu_nbv,
                                            gridSet.numRealAtomsTotal());
  
-    wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
-    wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
+    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
  }
  
  real nonbonded_verlet_t::pairlistInnerRadius() const
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h

index 7a6491a4aac4538d34e0819d022b94a12c4fdbce..a749b074b9f468f8333519cf79799705b984f4a5 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -212,7 +212,8 @@ struct nonbonded_verlet_t
                             std::unique_ptr<PairSearch>        pairSearch,
                             std::unique_ptr<nbnxn_atomdata_t>  nbat,
                             const Nbnxm::KernelSetup          &kernelSetup,
-                           gmx_nbnxn_gpu_t                   *gpu_nbv);
+                           gmx_nbnxn_gpu_t                   *gpu_nbv,
+                           gmx_wallcycle                     *wcycle);
  
          ~nonbonded_verlet_t();
  
@@ -261,8 +262,7 @@ struct nonbonded_verlet_t
                              bool                            fillLocal,
                              gmx::ArrayRef<const gmx::RVec>  x,
                              BufferOpsUseGpu                 useGpu,
-                            void                           *xPmeDevicePtr,
-                            gmx_wallcycle                  *wcycle);
+                            void                           *xPmeDevicePtr);
  
          //! Init for GPU version of setup coordinates in Nbnxm
          void atomdata_init_copy_x_to_nbat_x_gpu();
@@ -296,8 +296,7 @@ struct nonbonded_verlet_t
                                       int                         clearF,
                                       const t_forcerec           &fr,
                                       gmx_enerdata_t             *enerd,
-                                     t_nrnb                     *nrnb,
-                                     gmx_wallcycle              *wcycle);
+                                     t_nrnb                     *nrnb);
  
          //! Executes the non-bonded free-energy kernel, always runs on the CPU
          void dispatchFreeEnergyKernel(Nbnxm::InteractionLocality  iLocality,
@@ -309,18 +308,16 @@ struct nonbonded_verlet_t
                                        real                       *lambda,
                                        gmx_enerdata_t             *enerd,
                                        int                         forceFlags,
-                                      t_nrnb                     *nrnb,
-                                      gmx_wallcycle              *wcycle);
+                                      t_nrnb                     *nrnb);
  
          //! Add the forces stored in nbat to f, zeros the forces in nbat */
          void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality                 locality,
                                        rvec                               *f,
                                        BufferOpsUseGpu                     useGpu,
-                                      GpuBufferOpsAccumulateForce         accumulateForce,
-                                      gmx_wallcycle                      *wcycle);
+                                      GpuBufferOpsAccumulateForce         accumulateForce);
  
          /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
-        void atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle);
+        void atomdata_init_add_nbat_f_to_f_gpu();
  
          /*! \brief H2D transfer of force buffer*/
          void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality);
@@ -378,6 +375,8 @@ struct nonbonded_verlet_t
      private:
          //! The non-bonded setup, also affects the pairlist construction kernel
          Nbnxm::KernelSetup                kernelSetup_;
+        //! \brief Pointer to wallcycle structure.
+        gmx_wallcycle                    *wcycle_;
      public:
          //! GPU Nbnxm data, only used with a physical GPU (TODO: use unique_ptr)
          gmx_nbnxn_gpu_t                  *gpu_nbv;
@@ -396,7 +395,8 @@ init_nb_verlet(const gmx::MDLogger     &mdlog,
                 const gmx_hw_info_t     &hardwareInfo,
                 const gmx_device_info_t *deviceInfo,
                 const gmx_mtop_t        *mtop,
-               matrix                   box);
+               matrix                   box,
+               gmx_wallcycle           *wcycle);
  
  } // namespace Nbnxm
  
diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp

index 18c8182dd5c944865856034b83581eaa51bcb7db..6715fef9c2c5cca5936fbd9d140159110c7896ac 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_setup.cpp
+++ b/src/gromacs/nbnxm/nbnxm_setup.cpp
@@ -364,7 +364,8 @@ init_nb_verlet(const gmx::MDLogger     &mdlog,
                 const gmx_hw_info_t     &hardwareInfo,
                 const gmx_device_info_t *deviceInfo,
                 const gmx_mtop_t        *mtop,
-               matrix                   box)
+               matrix                   box,
+               gmx_wallcycle           *wcycle)
  {
      const bool          emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
      const bool          useGpu     = deviceInfo != nullptr;
@@ -482,7 +483,8 @@ init_nb_verlet(const gmx::MDLogger     &mdlog,
                                                  std::move(pairSearch),
                                                  std::move(nbat),
                                                  kernelSetup,
-                                                gpu_nbv);
+                                                gpu_nbv,
+                                                wcycle);
  }
  
  } // namespace Nbnxm
@@ -491,11 +493,13 @@ nonbonded_verlet_t::nonbonded_verlet_t(std::unique_ptr<PairlistSets>      pairli
                                         std::unique_ptr<PairSearch>        pairSearch,
                                         std::unique_ptr<nbnxn_atomdata_t>  nbat_in,
                                         const Nbnxm::KernelSetup          &kernelSetup,
-                                       gmx_nbnxn_gpu_t                   *gpu_nbv_ptr) :
+                                       gmx_nbnxn_gpu_t                   *gpu_nbv_ptr,
+                                       gmx_wallcycle                     *wcycle) :
      pairlistSets_(std::move(pairlistSets)),
      pairSearch_(std::move(pairSearch)),
      nbat(std::move(nbat_in)),
      kernelSetup_(kernelSetup),
+    wcycle_(wcycle),
      gpu_nbv(gpu_nbv_ptr)
  {
      GMX_RELEASE_ASSERT(pairlistSets_, "Need valid pairlistSets");
diff --git a/src/gromacs/nbnxm/prunekerneldispatch.cpp b/src/gromacs/nbnxm/prunekerneldispatch.cpp

index 899b9489be6af794447aebaebf98b4fac1f85bd9..cc5e3d08d909826a632522236004e85e5e39004b 100644 (file)
--- a/src/gromacs/nbnxm/prunekerneldispatch.cpp
+++ b/src/gromacs/nbnxm/prunekerneldispatch.cpp
@@ -37,6 +37,7 @@
  
  #include "gromacs/mdlib/gmx_omp_nthreads.h"
  #include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/timing/wallcycle.h"
  #include "gromacs/utility/gmxassert.h"
  
  #include "clusterdistancekerneltype.h"
@@ -97,9 +98,15 @@ nonbonded_verlet_t::dispatchPruneKernelCpu(const Nbnxm::InteractionLocality  iLo
  
  void nonbonded_verlet_t::dispatchPruneKernelGpu(int64_t step)
  {
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_NONBONDED);
+
      const bool stepIsEven = (pairlistSets().numStepsWithPairlist(step) % 2 == 0);
  
      Nbnxm::gpu_launch_kernel_pruneonly(gpu_nbv,
                                         stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
                                         pairlistSets().params().numRollingPruningParts);
+
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NONBONDED);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
  }
author	Szilárd Páll <pall.szilard@gmail.com>
	Mon, 29 Jul 2019 16:24:46 +0000 (18:24 +0200)
committer	Artem Zhmurov <zhmurov@gmail.com>
	Wed, 31 Jul 2019 11:21:13 +0000 (13:21 +0200)
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/nbnxm/kerneldispatch.cpp		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.cpp		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_setup.cpp		patch \| blob \| history
src/gromacs/nbnxm/prunekerneldispatch.cpp		patch \| blob \| history