Separate CPU NB kernel and buffer clearing subcounters
author Szilárd Páll <pall.szilard@gmail.com>
Mon, 13 Nov 2017 22:54:21 +0000 (23:54 +0100)
committer Artem Zhmurov <zhmurov@gmail.com>
Fri, 28 Jun 2019 22:10:23 +0000 (00:10 +0200)
This change makes it possible to compare the performance of the
pair-interaction kernels separately from the force buffer clearing.

Change-Id: Ifb2b4b3e5a43ac2ee547da651f9432a22fe58421

src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/kerneldispatch.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/timing/wallcycle.cpp
src/gromacs/timing/wallcycle.h

index c90dfed17d9c5175fc807170f94d9c867e4a06ab..8cb7e5a2c2134b994d01fdb4ece0d48ee455a3c6 100644 (file)
@@ -350,7 +350,7 @@ static void do_nb_verlet(t_forcerec                       *fr,
         wallcycle_sub_start(wcycle, ewcsNONBONDED);
     }
 
-    nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb);
+    nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb, wcycle);
 
     if (!nbv->useGpu())
     {
index eb8905525c8fd9f0113b08d9586b96160f0bb644..52bb7c2e823caf1bd49407a6a9f3db7b5c89c1f7 100644 (file)
@@ -54,6 +54,7 @@
 #include "gromacs/nbnxm/nbnxm_simd.h"
 #include "gromacs/nbnxm/kernels_reference/kernel_gpu_ref.h"
 #include "gromacs/simd/simd.h"
+#include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/real.h"
 
@@ -145,6 +146,7 @@ reduceGroupEnergySimdBuffers(int                       numGroups,
  * \param[in]     clearF        Enum that tells if to clear the force output buffer
  * \param[out]    vCoulomb      Output buffer for Coulomb energies
  * \param[out]    vVdw          Output buffer for Van der Waals energies
+ * \param[in]     wcycle        Pointer to cycle counting data structure.
  */
 static void
 nbnxn_kernel_cpu(const PairlistSet              &pairlistSet,
@@ -155,7 +157,8 @@ nbnxn_kernel_cpu(const PairlistSet              &pairlistSet,
                  int                             forceFlags,
                  int                             clearF,
                  real                           *vCoulomb,
-                 real                           *vVdw)
+                 real                           *vVdw,
+                 gmx_wallcycle                  *wcycle)
 {
 
     int                      coulkt;
@@ -238,6 +241,7 @@ nbnxn_kernel_cpu(const PairlistSet              &pairlistSet,
     gmx::ArrayRef<const NbnxnPairlistCpu> pairlists = pairlistSet.cpuLists();
 
     int gmx_unused                        nthreads = gmx_omp_nthreads_get(emntNonbonded);
+    wallcycle_sub_start(wcycle, ewcsNBFCLEARBUF);
 #pragma omp parallel for schedule(static) num_threads(nthreads)
     for (int nb = 0; nb < pairlists.ssize(); nb++)
     {
@@ -252,6 +256,12 @@ nbnxn_kernel_cpu(const PairlistSet              &pairlistSet,
             clear_fshift(out->fshift.data());
         }
 
+        if (nb == 0)
+        {
+            wallcycle_sub_stop(wcycle, ewcsNBFCLEARBUF);
+            wallcycle_sub_start(wcycle, ewcsNBFKERNEL);
+        }
+
         // TODO: Change to reference
         const NbnxnPairlistCpu *pairlist = &pairlists[nb];
 
@@ -383,6 +393,7 @@ nbnxn_kernel_cpu(const PairlistSet              &pairlistSet,
             }
         }
     }
+    wallcycle_sub_stop(wcycle, ewcsNBFKERNEL);
 
     if (forceFlags & GMX_FORCE_ENERGY)
     {
@@ -456,7 +467,8 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality
                                             int                        clearF,
                                             const t_forcerec          &fr,
                                             gmx_enerdata_t            *enerd,
-                                            t_nrnb                    *nrnb)
+                                            t_nrnb                    *nrnb,
+                                            gmx_wallcycle             *wcycle)
 {
     const PairlistSet &pairlistSet = pairlistSets().pairlistSet(iLocality);
 
@@ -475,7 +487,8 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality
                              enerd->grpp.ener[egCOULSR].data(),
                              fr.bBHAM ?
                              enerd->grpp.ener[egBHAMSR].data() :
-                             enerd->grpp.ener[egLJSR].data());
+                             enerd->grpp.ener[egLJSR].data(),
+                             wcycle);
             break;
 
         case Nbnxm::KernelType::Gpu8x8x8:
index 32ae108d1bb627c48b1c75ffe9a407d8cde55330..3c2d1f6af1a322450d05aa760b4bf5645337b1e1 100644 (file)
@@ -278,7 +278,8 @@ struct nonbonded_verlet_t
                                      int                         clearF,
                                      const t_forcerec           &fr,
                                      gmx_enerdata_t             *enerd,
-                                     t_nrnb                     *nrnb);
+                                     t_nrnb                     *nrnb,
+                                     gmx_wallcycle              *wcycle);
 
         //! Executes the non-bonded free-energy kernel, always runs on the CPU
         void dispatchFreeEnergyKernel(Nbnxm::InteractionLocality  iLocality,
index 34a00a67ebcb02ee603bbe3a6516e5bed4ada7b8..ffe8b1eaa2fb3da767ab185274cecc299d3465ba 100644 (file)
@@ -127,6 +127,7 @@ static const char *wcsn[ewcsNR] =
     "Listed buffer ops.",
     "Nonbonded pruning",
     "Nonbonded F",
+    "NB F kernel", "NB F clear buf",
     "Launch NB GPU tasks",
     "Launch Bonded GPU tasks",
     "Launch PME GPU tasks",
index fc8266f750a766d8e3e5a6ea0c5acc112c3fb8b0..58d1e500330d38b94bf5b20957650cc385fed3ff 100644 (file)
@@ -73,6 +73,7 @@ enum {
     ewcsLISTED_BUF_OPS,
     ewcsNONBONDED_PRUNING,
     ewcsNONBONDED,
+    ewcsNBFKERNEL, ewcsNBFCLEARBUF,
     ewcsLAUNCH_GPU_NONBONDED,
     ewcsLAUNCH_GPU_BONDED,
     ewcsLAUNCH_GPU_PME,