Use GpuTimers directly in CUDA and OpenCL versions of NBNXM
author    Artem Zhmurov <zhmurov@gmail.com>
Wed, 24 Feb 2021 13:42:23 +0000 (16:42 +0300)
committer Artem Zhmurov <zhmurov@gmail.com>
Wed, 24 Feb 2021 13:42:23 +0000 (16:42 +0300)
The GPU timers struct is used through proxy typedefs (cu_timers_t and cl_timers_t)
in the CUDA and OpenCL versions of NBNXM, which complicates unification of the code.
This change renames the struct from gpu_timers_t to GpuTimers and eliminates the
proxies by using the underlying struct directly in both versions.

Refs #2608

src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
src/gromacs/nbnxm/gpu_common.h
src/gromacs/nbnxm/gpu_types_common.h
src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h

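In short, each backend-specific header aliased the shared timer struct behind its own
typedef, so the CUDA and OpenCL code paths spelled the same thing differently. The
condensed excerpt below (drawn from the hunks that follow; illustrative only, not a
compilable unit on its own) shows the pattern being removed and its replacement:

-typedef struct Nbnxm::gpu_timers_t cu_timers_t;   /* nbnxm_cuda_types.h */
-typedef struct Nbnxm::gpu_timers_t cl_timers_t;   /* nbnxm_ocl_types.h  */

-    cu_timers_t*        t            = nb->timers;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
     timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
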
index ccb1fdf26d19f4e6354eeab609a3b756a41109ad..0ddaa17b146e746cd5230692b037e787cc35fd3e 100644 (file)
@@ -457,7 +457,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
 
     NBAtomData*         adat         = nb->atdat;
     gpu_plist*          plist        = nb->plist[iloc];
-    cu_timers_t*        t            = nb->timers;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
@@ -489,7 +489,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     /* beginning of timed HtoD section */
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
+        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
     }
 
     /* HtoD x, q */
@@ -505,7 +505,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
 
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
+        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
     }
 
     /* When we get here all misc operations issued in the local stream as well as
@@ -539,7 +539,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
     NBAtomData*         adat         = nb->atdat;
     NBParamGpu*         nbp          = nb->nbparam;
     gpu_plist*          plist        = nb->plist[iloc];
-    cu_timers_t*        t            = nb->timers;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
@@ -578,7 +578,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
     /* beginning of timed nonbonded calculation section */
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
+        timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
     }
 
     /* Kernel launch config:
@@ -619,7 +619,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
                 config.sharedMemorySize);
     }
 
-    auto*      timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+    auto*      timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
     const auto kernel =
             select_nbnxn_kernel(nbp->elecType,
                                 nbp->vdwType,
@@ -632,7 +632,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
 
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
+        timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
     }
 
     if (GMX_NATIVE_WINDOWS)
@@ -660,7 +660,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     NBAtomData*         adat         = nb->atdat;
     NBParamGpu*         nbp          = nb->nbparam;
     gpu_plist*          plist        = nb->plist[iloc];
-    cu_timers_t*        t            = nb->timers;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
@@ -711,7 +711,8 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     GpuRegionTimer* timer = nullptr;
     if (bDoTime)
     {
-        timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+        timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
+                                       : timers->interaction[iloc].rollingPrune_k);
     }
 
     /* beginning of timed prune calculation section */
@@ -800,7 +801,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
 
     /* extract the data */
     NBAtomData*         adat         = nb->atdat;
-    cu_timers_t*        t            = nb->timers;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
     bool                bDoTime      = nb->bDoTime;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
@@ -817,7 +818,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     /* beginning of timed D2H section */
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
+        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
     }
 
     /* With DD the local D2H transfer can only start after the non-local
@@ -884,7 +885,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
 
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
+        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
     }
 }
 
index 2505422927b705a17a3b6b97f7670d548741110e..f4467c0248a0149f98dc38b33e6e3fdaaa056d9c 100644 (file)
@@ -194,7 +194,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
 
     nb->bUseTwoStreams = bLocalAndNonlocal;
 
-    nb->timers = new cu_timers_t();
+    nb->timers = new Nbnxm::GpuTimers();
     snew(nb->timings, 1);
 
     /* init nbst */
@@ -313,7 +313,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     int                  nalloc, natoms;
     bool                 realloced;
     bool                 bDoTime       = nb->bDoTime;
-    cu_timers_t*         timers        = nb->timers;
+    Nbnxm::GpuTimers*    timers        = nb->timers;
     NBAtomData*          d_atdat       = nb->atdat;
     const DeviceContext& deviceContext = *nb->deviceContext_;
     const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
index 08d96de90f2b507885214c06002c113e9bfe8486..f3d1bb5acc5d1a9a6ed04b55cd63631fbf3e1a36 100644 (file)
 /*! \brief cluster size = number of atoms per cluster. */
 static constexpr int c_clSize = c_nbnxnGpuClusterSize;
 
-/** \internal
- * \brief Typedef of actual timer type.
- */
-typedef struct Nbnxm::gpu_timers_t cu_timers_t;
-
 /*! \internal
  * \brief Main data structure for CUDA nonbonded force calculations.
  */
@@ -140,7 +135,7 @@ struct NbnxmGpu
     /*! \brief True if event-based timing is enabled. */
     bool bDoTime = false;
     /*! \brief CUDA event-based timers. */
-    cu_timers_t* timers = nullptr;
+    Nbnxm::GpuTimers* timers = nullptr;
     /*! \brief Timing data. TODO: deprecate this and query timers for accumulated data instead */
     gmx_wallclock_gpu_nbnxn_t* timings = nullptr;
 };
index 92002bbf4ba12b59e41aaf4e4825f7ffb4e0dfe6..b2663b33f979075c3de2ca684960607d9cafb741 100644 (file)
@@ -193,19 +193,18 @@ static inline gmx::Range<int> getGpuAtomRange(const NBAtomData* atomData, const
  *   - 1st pass prune: ran during the current step (prior to the force kernel);
  *   - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
  *
- * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
- * after calling this function.
+ * Note that the resetting of Nbnxm::GpuTimers::didPrune and Nbnxm::GpuTimers::didRollingPrune
+ * should happen after calling this function.
  *
  * \param[in] timers   structs with GPU timer objects
  * \param[inout] timings  GPU task timing data
  * \param[in] iloc        interaction locality
  */
-template<typename GpuTimers>
-static void countPruneKernelTime(GpuTimers*                 timers,
+static void countPruneKernelTime(Nbnxm::GpuTimers*          timers,
                                  gmx_wallclock_gpu_nbnxn_t* timings,
                                  const InteractionLocality  iloc)
 {
-    gpu_timers_t::Interaction& iTimers = timers->interaction[iloc];
+    GpuTimers::Interaction& iTimers = timers->interaction[iloc];
 
     // We might have not done any pruning (e.g. if we skipped with empty domains).
     if (!iTimers.didPrune && !iTimers.didRollingPrune)
@@ -281,7 +280,6 @@ static inline void gpu_reduce_staged_outputs(const NBStagingData&      nbst,
  *      counters could end up being inconsistent due to not being incremented
  *      on some of the node when this is skipped on empty local domains!
  *
- * \tparam     GpuTimers         GPU timers type
  * \tparam     GpuPairlist       Pair list type
  * \param[out] timings           Pointer to the NB GPU timings data
  * \param[in]  timers            Pointer to GPU timers data
@@ -291,9 +289,9 @@ static inline void gpu_reduce_staged_outputs(const NBStagingData&      nbst,
  * \param[in]  doTiming          True if timing is enabled.
  *
  */
-template<typename GpuTimers, typename GpuPairlist>
+template<typename GpuPairlist>
 static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings,
-                                          GpuTimers*                 timers,
+                                          Nbnxm::GpuTimers*          timers,
                                           const GpuPairlist*         plist,
                                           AtomLocality               atomLocality,
                                           const gmx::StepWorkload&   stepWork,
index 85c5853ebd4de1cbae1ef6e1209c90ca9ee56035..4b7f0e762e8f079dc878eecbbd3315b5ed1e0b0c 100644 (file)
@@ -205,7 +205,7 @@ using gmx::InteractionLocality;
  * The two-sized arrays hold the local and non-local values and should always
  * be indexed with eintLocal/eintNonlocal.
  */
-struct gpu_timers_t
+struct GpuTimers
 {
     /*! \internal
      * \brief Timers for local or non-local coordinate/force transfers
@@ -244,7 +244,7 @@ struct gpu_timers_t
     //! timers for coordinate/force transfers (every step)
     gmx::EnumerationArray<AtomLocality, XFTransfers> xf;
     //! timers for interaction related transfers
-    gmx::EnumerationArray<InteractionLocality, Nbnxm::gpu_timers_t::Interaction> interaction;
+    gmx::EnumerationArray<InteractionLocality, Nbnxm::GpuTimers::Interaction> interaction;
 };
 
 /*! \internal
index aad56f49155f63facabd475f6fad7a6ae94c4e3a..51f7745f7f6791346c7847b8431f81f9823cee1c 100644 (file)
@@ -258,7 +258,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
         }
     }
 
-    gpu_timers_t::Interaction& iTimers = nb->timers->interaction[iloc];
+    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
 
     if (bDoTime)
     {
index 50e7b9d8d4f167e1a36eef7134a2047e8b4b7bd5..b8107fc857de911e79f6b48714301118a2a1aba5 100644 (file)
@@ -529,7 +529,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
 
     NBAtomData*         adat         = nb->atdat;
     gpu_plist*          plist        = nb->plist[iloc];
-    cl_timers_t*        t            = nb->timers;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
@@ -561,7 +561,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     /* beginning of timed HtoD section */
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
+        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
     }
 
     /* HtoD x, q */
@@ -573,11 +573,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
                        atomsRange.size(),
                        deviceStream,
                        GpuApiCallBehavior::Async,
-                       bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
+                       bDoTime ? timers->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
 
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
+        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
     }
 
     /* When we get here all misc operations issued in the local stream as well as
@@ -613,7 +613,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
     NBAtomData*         adat         = nb->atdat;
     NBParamGpu*         nbp          = nb->nbparam;
     gpu_plist*          plist        = nb->plist[iloc];
-    cl_timers_t*        t            = nb->timers;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
@@ -655,7 +655,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
     /* beginning of timed nonbonded calculation section */
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
+        timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
     }
 
     /* kernel launch config */
@@ -685,7 +685,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
 
     fillin_ocl_structures(nbp, &nbparams_params);
 
-    auto*          timingEvent  = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+    auto* timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
     constexpr char kernelName[] = "k_calc_nb";
     const auto     kernel =
             select_nbnxn_kernel(nb,
@@ -745,7 +745,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
 
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
+        timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
     }
 }
 
@@ -784,7 +784,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     NBAtomData*         adat         = nb->atdat;
     NBParamGpu*         nbp          = nb->nbparam;
     gpu_plist*          plist        = nb->plist[iloc];
-    cl_timers_t*        t            = nb->timers;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
     bool                bDoTime      = nb->bDoTime;
 
@@ -834,7 +834,8 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     GpuRegionTimer* timer = nullptr;
     if (bDoTime)
     {
-        timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+        timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
+                                       : timers->interaction[iloc].rollingPrune_k);
     }
 
     /* beginning of timed prune calculation section */
@@ -933,7 +934,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
                "beginning of the copy back function.");
 
     NBAtomData*         adat         = nb->atdat;
-    cl_timers_t*        t            = nb->timers;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
     bool                bDoTime      = nb->bDoTime;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
@@ -958,7 +959,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     /* beginning of timed D2H section */
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
+        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
     }
 
     /* With DD the local D2H transfer can only start after the non-local
@@ -978,7 +979,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
                          atomsRange.size(),
                          deviceStream,
                          GpuApiCallBehavior::Async,
-                         bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+                         bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
 
     /* kick off work */
     cl_error = clFlush(deviceStream.stream());
@@ -1009,7 +1010,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
                                  SHIFTS,
                                  deviceStream,
                                  GpuApiCallBehavior::Async,
-                                 bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
         }
 
         /* DtoH energies */
@@ -1023,7 +1024,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
                                  1,
                                  deviceStream,
                                  GpuApiCallBehavior::Async,
-                                 bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
             static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                           "Sizes of host- and device-side electrostatic energy terms should be the "
                           "same.");
@@ -1033,13 +1034,13 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
                                  1,
                                  deviceStream,
                                  GpuApiCallBehavior::Async,
-                                 bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
         }
     }
 
     if (bDoTime)
     {
-        t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
+        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
     }
 }
 
index 25cb3158b2b58f26209bd10cafe5794438c8195d..a2ee20416f4684cfd6468e906a6380466c853a7b 100644 (file)
@@ -297,7 +297,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
 
     nb->bUseTwoStreams = bLocalAndNonlocal;
 
-    nb->timers = new cl_timers_t();
+    nb->timers = new Nbnxm::GpuTimers();
     snew(nb->timings, 1);
 
     /* set device info, just point it to the right GPU among the detected ones */
@@ -424,7 +424,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     int                  nalloc, natoms;
     bool                 realloced;
     bool                 bDoTime       = nb->bDoTime;
-    cl_timers_t*         timers        = nb->timers;
+    Nbnxm::GpuTimers*    timers        = nb->timers;
     NBAtomData*          d_atdat       = nb->atdat;
     const DeviceContext& deviceContext = *nb->deviceContext_;
     const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
index 474d90700d95d3d6aba8600a571f3f235a43884a..c802c4199fd865f6bdbcf8b8aec940035d5187c9 100644 (file)
@@ -127,11 +127,6 @@ typedef struct cl_nbparam_params
 } cl_nbparam_params_t;
 
 
-/** \internal
- * \brief Typedef of actual timer type.
- */
-typedef struct Nbnxm::gpu_timers_t cl_timers_t;
-
 /*! \internal
  * \brief Main data structure for OpenCL nonbonded force calculations.
  */
@@ -206,7 +201,7 @@ struct NbnxmGpu
     //! True if event-based timing is enabled.
     bool bDoTime = false;
     //! OpenCL event-based timers.
-    cl_timers_t* timers = nullptr;
+    Nbnxm::GpuTimers* timers = nullptr;
     //! Timing data. TODO: deprecate this and query timers for accumulated data instead
     gmx_wallclock_gpu_nbnxn_t* timings = nullptr;
 };
index c1e23c1a7420f94bf59801e59a12f75f35c26759..fd1d655e3eff3b46b9a6af0b0078efd5c4c62707 100644 (file)
@@ -85,7 +85,7 @@ struct NbnxmGpu
     /*! \brief True if event-based timing is enabled. Always false for SYCL. */
     bool bDoTime = false;
     /*! \brief Dummy timers. */
-    Nbnxm::gpu_timers_t* timers = nullptr;
+    Nbnxm::GpuTimers* timers = nullptr;
     /*! \brief Dummy timing data. */
     gmx_wallclock_gpu_nbnxn_t* timings = nullptr;