From f2d07358eff989ea204972f0290e4991ea0aa1de Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Fri, 20 Oct 2017 22:26:25 +0200
Subject: [PATCH] Merge common nbnxn CUDA/OpenCL GPU wait code-paths

The entire GPU wait of the nonbonded GPU modules, including timing
accumulation as well as reduction of staged data, has been unified by
moving a single templated version of the code into the common header.

Code has only been moved, and changed in minor ways where necessary
(e.g. for the rvec reduction).

Change-Id: Ic9c9690be58a78f92ca99d2af30068e19c19cc6c
---
 src/gromacs/gpu_utils/cudautils.cuh | 23 ++
 src/gromacs/gpu_utils/oclutils.h | 13 ++
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu | 176 +-------------
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 2 +-
 src/gromacs/mdlib/nbnxn_gpu.h | 13 +-
 src/gromacs/mdlib/nbnxn_gpu_common.h | 219 ++++++++++++++++--
 src/gromacs/mdlib/nbnxn_gpu_common_utils.h | 68 ++++++
 src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp | 182 +--------------
 .../mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp | 2 +-
 9 files changed, 326 insertions(+), 372 deletions(-)
 create mode 100644 src/gromacs/mdlib/nbnxn_gpu_common_utils.h

diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index bea1e4a0bd..feb241d593 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -42,6 +42,8 @@
 #include
 #endif /* HAVE_NVML */

+#include "gromacs/math/vec.h"
+#include "gromacs/math/vectypes.h"
 #include "gromacs/utility/fatalerror.h"

 /* TODO error checking needs to be rewritten. We have 2 types of error checks needed
@@ -198,4 +200,25 @@ void destroyParamLookupTable(T *d_ptr, const struct texture *texRef,
                              const gmx_device_info_t *devInfo);

+/*! \brief Add the three components stored in a float3 to an rvec variable.
+ *
+ * \param[out]  a  Rvec to increment
+ * \param[in]   b  Float triplet to increment with.
+ */
+static inline void rvec_inc(rvec a, const float3 b)
+{
+    rvec tmp = {b.x, b.y, b.z};
+    rvec_inc(a, tmp);
+}
+
+/*! \brief Calls cudaStreamSynchronize() in the stream \p s.
+ *
+ * \param[in] s stream to synchronize with
+ */
+static inline void gpuStreamSynchronize(cudaStream_t s)
+{
+    cudaError_t stat = cudaStreamSynchronize(s);
+    CU_RET_ERR(stat, "cudaStreamSynchronize failed");
+}
+
 #endif
diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h
index 72af59a201..7723fe424c 100644
--- a/src/gromacs/gpu_utils/oclutils.h
+++ b/src/gromacs/gpu_utils/oclutils.h
@@ -53,6 +53,8 @@
 #include

+#include "gromacs/utility/gmxassert.h"
+
 /*! \brief OpenCL vendor IDs */
 typedef enum { OCL_VENDOR_NVIDIA = 0,
@@ -135,4 +137,15 @@ void ocl_pfree(void *h_ptr);

 /*! \brief Convert error code to diagnostic string */
 std::string ocl_get_error_string(cl_int error);

+/*! \brief Calls clFinish() in the stream \p s.
+ * + * \param[in] s stream to synchronize with + */ +static inline void gpuStreamSynchronize(cl_command_queue s) +{ + cl_int cl_error = clFinish(s); + GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, + ("Error caught during clFinish:" + ocl_get_error_string(cl_error)).c_str()); +} + #endif diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 36ae7fe481..ad889820e3 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -57,6 +57,7 @@ #include "gromacs/mdlib/force_flags.h" #include "gromacs/mdlib/nb_verlet.h" #include "gromacs/mdlib/nbnxn_gpu_common.h" +#include "gromacs/mdlib/nbnxn_gpu_common_utils.h" #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" #include "gromacs/mdlib/nbnxn_pairlist.h" #include "gromacs/timing/gpu_timing.h" @@ -147,12 +148,6 @@ typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, static const bool bUseCudaLaunchKernel = (GMX_CUDA_VERSION >= 7000) && (getenv("GMX_DISABLE_CUDALAUNCH") == NULL); -/* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */ -static bool always_ener = (getenv("GMX_GPU_ALWAYS_ENER") != NULL); -static bool never_ener = (getenv("GMX_GPU_NEVER_ENER") != NULL); -static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL); - - /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */ static inline int calc_nb_kernel_nblock(int nwork_units, const gmx_device_info_t *dinfo) { @@ -350,9 +345,6 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, bool bCalcFshift = flags & GMX_FORCE_VIRIAL; bool bDoTime = nb->bDoTime; - /* turn energy calculation always on/off (for debugging/testing only) */ - bCalcEner = (bCalcEner || always_ener) && !never_ener; - /* Don't launch the non-local kernel if there is no work to do. Doing the same for the local kernel is more complicated, since the local part of the force array also depends on the non-local kernel. @@ -438,7 +430,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, nb_kernel = select_nbnxn_kernel(nbp->eeltype, nbp->vdwtype, bCalcEner, - (plist->haveFreshList && !nb->timers->didPrune[iloc]) || always_prune, + (plist->haveFreshList && !nb->timers->didPrune[iloc]), nb->dev_info); /* Kernel launch config: @@ -736,170 +728,6 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, } } -/*! \brief Count pruning kernel time if either kernel has been triggered - * - * We do the accounting for either of the two pruning kernel flavors: - * - 1st pass prune: ran during the current step (prior to the force kernel); - * - rolling prune: ran at the end of the previous step (prior to the current step H2D xq); - * - * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen - * after calling this function. - * - * \param[in] timers structs with CUDA timer objects - * \param[inout] timings GPU task timing data - * \param[in] iloc interaction locality - */ -static void countPruneKernelTime(cu_timers_t *timers, - gmx_wallclock_gpu_nbnxn_t *timings, - const int iloc) -{ - // We might have not done any pruning (e.g. if we skipped with empty domains). - if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc]) - { - return; - } - - if (timers->didPrune[iloc]) - { - timings->pruneTime.c++; - timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime(); - } - if (timers->didRollingPrune[iloc]) - { - timings->dynamicPruneTime.c++; - timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime(); - } -} - -/*! 
\brief Count pruning kernel time if either kernel has been triggered */ -static void nbnxn_gpu_reduce_staged_outputs(nb_staging nbst, - int iLocality, - bool reduceEnergies, - bool reduceFshift, - real *e_lj, - real *e_el, - rvec *fshift) -{ - /* turn energy calculation always on/off (for debugging/testing only) */ - reduceEnergies = (reduceEnergies || always_ener) && !never_ener; - - /* add up energies and shift forces (only once at local F wait) */ - if (LOCAL_I(iLocality)) - { - if (reduceEnergies) - { - *e_lj += *nbst.e_lj; - *e_el += *nbst.e_el; - } - - if (reduceFshift) - { - for (int i = 0; i < SHIFTS; i++) - { - fshift[i][0] += nbst.fshift[i].x; - fshift[i][1] += nbst.fshift[i].y; - fshift[i][2] += nbst.fshift[i].z; - } - } - } -} - -/*! \brief Do the per-step timing accounting of the nonbonded tasks. */ -static void nbnxn_gpu_accumulate_timings(struct gmx_wallclock_gpu_nbnxn_t *timings, - cu_timers_t *timers, - const cu_plist_t *plist, - int atomLocality, - bool didEnergyKernels, - bool doTiming) -{ - /* timing data accumulation */ - if (!doTiming) - { - return; - } - - /* determine interaction locality from atom locality */ - int iLocality = gpuAtomToInteractionLocality(atomLocality); - - /* turn energy calculation always on/off (for debugging/testing only) */ - didEnergyKernels = (didEnergyKernels || always_ener) && !never_ener; - - /* only increase counter once (at local F wait) */ - if (LOCAL_I(iLocality)) - { - timings->nb_c++; - timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1; - } - - /* kernel timings */ - timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t += - timers->nb_k[iLocality].getLastRangeTime(); - - /* X/q H2D and F D2H timings */ - timings->nb_h2d_t += timers->nb_h2d[iLocality].getLastRangeTime(); - timings->nb_d2h_t += timers->nb_d2h[iLocality].getLastRangeTime(); - - /* Count the pruning kernel times for both cases:1st pass (at search step) - and rolling pruning (if called at the previous step). - We do the accounting here as this is the only sync point where we - know (without checking or additional sync-ing) that prune tasks in - in the current stream have completed (having just blocking-waited - for the force D2H). */ - countPruneKernelTime(timers, timings, iLocality); - - /* only count atdat and pair-list H2D at pair-search step */ - if (timers->didPairlistH2D[iLocality]) - { - /* atdat transfer timing (add only once, at local F wait) */ - if (LOCAL_A(atomLocality)) - { - timings->pl_h2d_c++; - timings->pl_h2d_t += timers->atdat.getLastRangeTime(); - } - - timings->pl_h2d_t += timers->pl_h2d[iLocality].getLastRangeTime(); - - /* Clear the timing flag for the next step */ - timers->didPairlistH2D[iLocality] = false; - } - - -} - -void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb, - int flags, int aloc, - real *e_lj, real *e_el, rvec *fshift) -{ - /* determine interaction locality from atom locality */ - int iLocality = gpuAtomToInteractionLocality(aloc); - - /* Launch wait/update timers & counters and do reduction into staging buffers - BUT skip it when during the non-local phase there was actually no work to do. - This is consistent with nbnxn_gpu_launch_kernel. - - NOTE: if timing with multiple GPUs (streams) becomes possible, the - counters could end up being inconsistent due to not being incremented - on some of the nodes! 
*/ - if (!canSkipWork(nb, iLocality)) - { - cudaError_t stat = cudaStreamSynchronize(nb->stream[iLocality]); - CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb"); - - bool calcEner = flags & GMX_FORCE_ENERGY; - bool calcFshift = flags & GMX_FORCE_VIRIAL; - - nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, nb->bDoTime); - - nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift); - } - - /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */ - nb->timers->didPrune[iLocality] = nb->timers->didRollingPrune[iLocality] = false; - - /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */ - nb->plist[iLocality]->haveFreshList = false; -} - const struct texture &nbnxn_cuda_get_nbfp_texref() { return nbfp_texref; diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index e01e2a66fa..cd2f00b5e9 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -52,7 +52,7 @@ #include "gromacs/mdlib/force_flags.h" #include "gromacs/mdlib/nb_verlet.h" #include "gromacs/mdlib/nbnxn_consts.h" -#include "gromacs/mdlib/nbnxn_gpu_common.h" +#include "gromacs/mdlib/nbnxn_gpu_common_utils.h" #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" #include "gromacs/mdtypes/interaction_const.h" #include "gromacs/mdtypes/md_enums.h" diff --git a/src/gromacs/mdlib/nbnxn_gpu.h b/src/gromacs/mdlib/nbnxn_gpu.h index 1b903f1336..cdb68597af 100644 --- a/src/gromacs/mdlib/nbnxn_gpu.h +++ b/src/gromacs/mdlib/nbnxn_gpu.h @@ -122,8 +122,19 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, int gmx_unused aloc) GPU_FUNC_TERM /*! \brief - * Wait for the asynchronously launched nonbonded calculations and data + * Wait for the asynchronously launched nonbonded tasks and data * transfers to finish. + * + * Also does timing accounting and reduction of the internal staging buffers. + * As this is called at the end of the step, it also resets the pair list and + * pruning flags. + * + * \param[in] nb The nonbonded data GPU structure + * \param[in] flags Force flags + * \param[in] aloc Atom locality identifier + * \param[out] e_lj Pointer to the LJ energy output to accumulate into + * \param[out] e_el Pointer to the electrostatics energy output to accumulate into + * \param[out] fshift Pointer to the shift force buffer to accumulate into */ GPU_FUNC_QUALIFIER void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_gpu_t gmx_unused *nb, diff --git a/src/gromacs/mdlib/nbnxn_gpu_common.h b/src/gromacs/mdlib/nbnxn_gpu_common.h index 2b93a015cb..63519e0daa 100644 --- a/src/gromacs/mdlib/nbnxn_gpu_common.h +++ b/src/gromacs/mdlib/nbnxn_gpu_common.h @@ -33,9 +33,10 @@ * the research papers on the package. Check out http://www.gromacs.org. */ /*! \internal \file - * \brief Implements common routines for different NBNXN GPU implementations + * \brief Common functions for the different NBNXN GPU implementations. + * + * \author Szilard Pall * - * \author Aleksei Iupinov * \ingroup module_mdlib */ @@ -53,21 +54,14 @@ #if GMX_GPU == GMX_GPU_OPENCL #include "nbnxn_ocl/nbnxn_ocl_types.h" #endif + +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/nbnxn_gpu_types.h" +#include "gromacs/pbcutil/ishift.h" +#include "gromacs/timing/gpu_timing.h" #include "gromacs/utility/stringutil.h" - -/*! 
\brief An early return condition for empty NB GPU workloads
- *
- * This is currently used for non-local kernels/transfers only.
- * Skipping the local kernel is more complicated, since the
- * local part of the force array also depends on the non-local kernel.
- * The skip of the local kernel is taken care of separately.
- */
-static inline bool canSkipWork(const gmx_nbnxn_gpu_t *nb, int iloc)
-{
-    assert(nb && nb->plist[iloc]);
-    return (iloc == eintNonlocal) && (nb->plist[iloc]->nsci == 0);
-}
+#include "nbnxn_gpu_common_utils.h"

 /*! \brief Check that atom locality values are valid for the GPU module.
  *
@@ -142,4 +136,199 @@ static inline void getGpuAtomRange(const AtomDataT *atomData,
     }
 }
+
+/*! \brief Count pruning kernel time if either kernel has been triggered
+ *
+ * We do the accounting for either of the two pruning kernel flavors:
+ *  - 1st pass prune: ran during the current step (prior to the force kernel);
+ *  - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
+ *
+ * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
+ * after calling this function.
+ *
+ * \param[in]    timers   structs with GPU timer objects
+ * \param[inout] timings  GPU task timing data
+ * \param[in]    iloc     interaction locality
+ */
+template <typename GpuTimers>
+static void countPruneKernelTime(GpuTimers                 *timers,
+                                 gmx_wallclock_gpu_nbnxn_t *timings,
+                                 const int                  iloc)
+{
+    // We might not have done any pruning (e.g. if we skipped with empty domains).
+    if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
+    {
+        return;
+    }
+
+    if (timers->didPrune[iloc])
+    {
+        timings->pruneTime.c++;
+        timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime();
+    }
+
+    if (timers->didRollingPrune[iloc])
+    {
+        timings->dynamicPruneTime.c++;
+        timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime();
+    }
+}
+
+/*! \brief Reduce data staged internally in the nbnxn module.
+ *
+ * Shift forces and electrostatic/LJ energies copied from the GPU into
+ * a module-internal staging area are immediately reduced into the CPU-side
+ * buffers passed to this function, after the transfers have completed.
+ *
+ * Note that this function should therefore always be called after the transfers
+ * into the staging buffers have completed.
+ *
+ * \tparam     StagingData     Type of staging data
+ * \param[in]  nbst            Nonbonded staging data
+ * \param[in]  iLocality       Interaction locality specifier
+ * \param[in]  reduceEnergies  True if energy reduction should be done
+ * \param[in]  reduceFshift    True if shift force reduction should be done
+ * \param[out] e_lj            Variable to accumulate LJ energy into
+ * \param[out] e_el            Variable to accumulate electrostatic energy into
+ * \param[out] fshift          Pointer to the array of shift forces to accumulate into
+ */
+template <typename StagingData>
+static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst,
+                                                    int                iLocality,
+                                                    bool               reduceEnergies,
+                                                    bool               reduceFshift,
+                                                    real              *e_lj,
+                                                    real              *e_el,
+                                                    rvec              *fshift)
+{
+    /* add up energies and shift forces (only once at local F wait) */
+    if (LOCAL_I(iLocality))
+    {
+        if (reduceEnergies)
+        {
+            *e_lj += *nbst.e_lj;
+            *e_el += *nbst.e_el;
+        }
+
+        if (reduceFshift)
+        {
+            for (int i = 0; i < SHIFTS; i++)
+            {
+                rvec_inc(fshift[i], nbst.fshift[i]);
+            }
+        }
+    }
+}
+
+/*! \brief Do the per-step timing accounting of the nonbonded tasks.
+ *
+ * Does timing accumulation and call-count increments for the nonbonded kernels.
+ * Note that this function should be called after the current step's
+ * nonbonded tasks have completed, with the exception of the rolling pruning kernels
+ * that are accounted for during the following step.
+ *
+ * \tparam     GpuTimers         GPU timers type
+ * \tparam     GpuPairlist       Pair list type
+ * \param[out] timings           Pointer to the NB GPU timings data
+ * \param[in]  timers            Pointer to GPU timers data
+ * \param[in]  plist             Pointer to the pair list data
+ * \param[in]  atomLocality      Atom locality specifier
+ * \param[in]  didEnergyKernels  True if energy kernels have been called in the current step
+ * \param[in]  doTiming          True if timing is enabled.
+ *
+ */
+template <typename GpuTimers, typename GpuPairlist>
+static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
+                                                 GpuTimers                 *timers,
+                                                 const GpuPairlist         *plist,
+                                                 int                        atomLocality,
+                                                 bool                       didEnergyKernels,
+                                                 bool                       doTiming)
+{
+    /* timing data accumulation */
+    if (!doTiming)
+    {
+        return;
+    }
+
+    /* determine interaction locality from atom locality */
+    int iLocality = gpuAtomToInteractionLocality(atomLocality);
+
+    /* only increase counter once (at local F wait) */
+    if (LOCAL_I(iLocality))
+    {
+        timings->nb_c++;
+        timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
+    }
+
+    /* kernel timings */
+    timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
+        timers->nb_k[iLocality].getLastRangeTime();
+
+    /* X/q H2D and F D2H timings */
+    timings->nb_h2d_t += timers->nb_h2d[iLocality].getLastRangeTime();
+    timings->nb_d2h_t += timers->nb_d2h[iLocality].getLastRangeTime();
+
+    /* Count the pruning kernel times for both cases: 1st pass (at search step)
+       and rolling pruning (if called at the previous step).
+       We do the accounting here as this is the only sync point where we
+       know (without checking or additional sync-ing) that prune tasks in
+       the current stream have completed (having just blocking-waited
+       for the force D2H). */
+    countPruneKernelTime(timers, timings, iLocality);
+
+    /* only count atdat and pair-list H2D at pair-search step */
+    if (timers->didPairlistH2D[iLocality])
+    {
+        /* atdat transfer timing (add only once, at local F wait) */
+        if (LOCAL_A(atomLocality))
+        {
+            timings->pl_h2d_c++;
+            timings->pl_h2d_t += timers->atdat.getLastRangeTime();
+        }
+
+        timings->pl_h2d_t += timers->pl_h2d[iLocality].getLastRangeTime();
+
+        /* Clear the timing flag for the next step */
+        timers->didPairlistH2D[iLocality] = false;
+    }
+}
+
+// Documented in nbnxn_gpu.h
+void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_gpu_t *nb,
+                            int              flags,
+                            int              aloc,
+                            real            *e_lj,
+                            real            *e_el,
+                            rvec            *fshift)
+{
+    /* determine interaction locality from atom locality */
+    int iLocality = gpuAtomToInteractionLocality(aloc);
+
+    /* Launch wait/update timers & counters and do reduction into staging buffers
+       BUT skip it when there was actually no work to do during the non-local phase.
+       This is consistent with nbnxn_gpu_launch_kernel.
+
+       NOTE: if timing with multiple GPUs (streams) becomes possible, the
+       counters could end up being inconsistent due to not being incremented
+       on some of the nodes!
*/ + if (!canSkipWork(nb, iLocality)) + { + gpuStreamSynchronize(nb->stream[iLocality]); + + bool calcEner = flags & GMX_FORCE_ENERGY; + bool calcFshift = flags & GMX_FORCE_VIRIAL; + + nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, nb->bDoTime); + + nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift); + } + + /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */ + nb->timers->didPrune[iLocality] = nb->timers->didRollingPrune[iLocality] = false; + + /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */ + nb->plist[iLocality]->haveFreshList = false; +} + #endif diff --git a/src/gromacs/mdlib/nbnxn_gpu_common_utils.h b/src/gromacs/mdlib/nbnxn_gpu_common_utils.h new file mode 100644 index 0000000000..f6cbe10c57 --- /dev/null +++ b/src/gromacs/mdlib/nbnxn_gpu_common_utils.h @@ -0,0 +1,68 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief Implements common util routines for different NBNXN GPU implementations + * + * \author Aleksei Iupinov + * \ingroup module_mdlib + */ + +#ifndef GMX_MDLIB_NBNXN_GPU_COMMON_UTILS_H +#define GMX_MDLIB_NBNXN_GPU_COMMON_UTILS_H + +#include "config.h" + +#if GMX_GPU == GMX_GPU_CUDA +#include "nbnxn_cuda/nbnxn_cuda_types.h" +#endif + +#if GMX_GPU == GMX_GPU_OPENCL +#include "nbnxn_ocl/nbnxn_ocl_types.h" +#endif + +/*! \brief An early return condition for empty NB GPU workloads + * + * This is currently used for non-local kernels/transfers only. + * Skipping the local kernel is more complicated, since the + * local part of the force array also depends on the non-local kernel. + * The skip of the local kernel is taken care of separately. 
+ */ +static inline bool canSkipWork(const gmx_nbnxn_gpu_t *nb, int iloc) +{ + assert(nb && nb->plist[iloc]); + return (iloc == eintNonlocal) && (nb->plist[iloc]->nsci == 0); +} + +#endif diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp index 9a1a40ff0b..c5615ceb00 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp @@ -76,6 +76,7 @@ #include "gromacs/mdlib/nbnxn_consts.h" #include "gromacs/mdlib/nbnxn_gpu.h" #include "gromacs/mdlib/nbnxn_gpu_common.h" +#include "gromacs/mdlib/nbnxn_gpu_common_utils.h" #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" #include "gromacs/mdlib/nbnxn_pairlist.h" #include "gromacs/pbcutil/ishift.h" @@ -97,12 +98,6 @@ static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster; static const int c_clSize = c_nbnxnGpuClusterSize; //@} -/*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */ -//@{ -static bool always_ener = (getenv("GMX_GPU_ALWAYS_ENER") != NULL); -static bool never_ener = (getenv("GMX_GPU_NEVER_ENER") != NULL); -static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL); -//@} /* Uncomment this define to enable kernel debugging */ //#define DEBUG_OCL @@ -416,9 +411,6 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, size_t debug_buffer_size; #endif - /* turn energy calculation always on/off (for debugging/testing only) */ - bCalcEner = (bCalcEner || always_ener) && !never_ener; - /* Don't launch the non-local kernel if there is no work to do. Doing the same for the local kernel is more complicated, since the local part of the force array also depends on the non-local kernel. @@ -517,7 +509,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, nbp->eeltype, nbp->vdwtype, bCalcEner, - (plist->haveFreshList && !nb->timers->didPrune[iloc]) || always_prune); + (plist->haveFreshList && !nb->timers->didPrune[iloc])); /* kernel launch config */ local_work_size[0] = c_clSize; @@ -933,176 +925,6 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, } } -/*! \brief Count pruning kernel time if either kernel has been triggered - * - * We do the accounting for either of the two pruning kernel flavors: - * - 1st pass prune: ran during the current step (prior to the force kernel); - * - rolling prune: ran at the end of the previous step (prior to the current step H2D xq); - * - * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen - * after calling this function. - * - * \param[inout] timers structs with OCL timer objects - * \param[inout] timings GPU task timing data - * \param[in] iloc interaction locality - */ -static void countPruneKernelTime(cl_timers_t *timers, - gmx_wallclock_gpu_nbnxn_t *timings, - const int iloc) -{ - // We might have not done any pruning (e.g. if we skipped with empty domains). - if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc]) - { - return; - } - - if (timers->didPrune[iloc]) - { - timings->pruneTime.c++; - timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime(); - } - - if (timers->didRollingPrune[iloc]) - { - timings->dynamicPruneTime.c++; - timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime(); - } -} - -/*! 
\brief Count pruning kernel time if either kernel has been triggered */ -static void nbnxn_gpu_reduce_staged_outputs(const cl_nb_staging &nbst, - int iLocality, - bool reduceEnergies, - bool reduceFshift, - real *e_lj, - real *e_el, - rvec *fshift) -{ - /* turn energy calculation always on/off (for debugging/testing only) */ - reduceEnergies = (reduceEnergies || always_ener) && !never_ener; - - /* add up energies and shift forces (only once at local F wait) */ - if (LOCAL_I(iLocality)) - { - if (reduceEnergies) - { - *e_lj += *nbst.e_lj; - *e_el += *nbst.e_el; - } - - if (reduceFshift) - { - for (int i = 0; i < SHIFTS; i++) - { - fshift[i][0] += (nbst.fshift)[i][0]; - fshift[i][1] += (nbst.fshift)[i][1]; - fshift[i][2] += (nbst.fshift)[i][2]; - } - } - } -} - -/*! \brief Do the per-step timing accounting of the nonbonded tasks. */ -static void nbnxn_gpu_accumulate_timings(struct gmx_wallclock_gpu_nbnxn_t *timings, - cl_timers_t *timers, - const cl_plist_t *plist, - int atomLocality, - bool didEnergyKernels, - bool doTiming) -{ - /* timing data accumulation */ - if (!doTiming) - { - return; - } - - /* determine interaction locality from atom locality */ - int iLocality = gpuAtomToInteractionLocality(atomLocality); - - /* turn energy calculation always on/off (for debugging/testing only) */ - didEnergyKernels = (didEnergyKernels || always_ener) && !never_ener; - - /* only increase counter once (at local F wait) */ - if (LOCAL_I(iLocality)) - { - timings->nb_c++; - timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1; - } - - /* kernel timings */ - timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t += - timers->nb_k[iLocality].getLastRangeTime(); - - /* X/q H2D and F D2H timings */ - timings->nb_h2d_t += timers->nb_h2d[iLocality].getLastRangeTime(); - timings->nb_d2h_t += timers->nb_d2h[iLocality].getLastRangeTime(); - - /* Count the pruning kernel times for both cases:1st pass (at search step) - and rolling pruning (if called at the previous step). - We do the accounting here as this is the only sync point where we - know (without checking or additional sync-ing) that prune tasks in - in the current stream have completed (having just blocking-waited - for the force D2H). */ - countPruneKernelTime(timers, timings, iLocality); - - /* only count atdat and pair-list H2D at pair-search step */ - if (timers->didPairlistH2D[iLocality]) - { - /* atdat transfer timing (add only once, at local F wait) */ - if (LOCAL_A(atomLocality)) - { - timings->pl_h2d_c++; - timings->pl_h2d_t += timers->atdat.getLastRangeTime(); - } - - timings->pl_h2d_t += timers->pl_h2d[iLocality].getLastRangeTime(); - - /* Clear the timing flag for the next step */ - timers->didPairlistH2D[iLocality] = false; - } - - -} - -/*! \brief - * Wait for the asynchronously launched nonbonded calculations and data - * transfers to finish. - */ -void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb, - int flags, int aloc, - real *e_lj, real *e_el, rvec *fshift) -{ - /* determine interaction locality from atom locality */ - int iLocality = gpuAtomToInteractionLocality(aloc); - - /* Launch wait/update timers & counters and do reduction into staging buffers - BUT skip it when during the non-local phase there was actually no work to do. - This is consistent with nbnxn_gpu_launch_kernel. - - NOTE: if timing with multiple GPUs (streams) becomes possible, the - counters could end up being inconsistent due to not being incremented - on some of the nodes! 
*/ - if (!canSkipWork(nb, iLocality)) - { - /* Actual sync point. Waits for everything to be finished in the command queue. TODO: Find out if a more fine grained solution is needed */ - cl_int gmx_unused cl_error = clFinish(nb->stream[iLocality]); - assert(CL_SUCCESS == cl_error); - - bool calcEner = flags & GMX_FORCE_ENERGY; - bool calcFshift = flags & GMX_FORCE_VIRIAL; - - nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, nb->bDoTime); - - nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift); - - } - - /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */ - nb->timers->didPrune[iLocality] = nb->timers->didRollingPrune[iLocality] = false; - - /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */ - nb->plist[iLocality]->haveFreshList = false; -} /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */ int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut) diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp index b81e2965fa..6174281e3d 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp @@ -58,7 +58,7 @@ #include "gromacs/mdlib/nb_verlet.h" #include "gromacs/mdlib/nbnxn_consts.h" #include "gromacs/mdlib/nbnxn_gpu.h" -#include "gromacs/mdlib/nbnxn_gpu_common.h" +#include "gromacs/mdlib/nbnxn_gpu_common_utils.h" #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" #include "gromacs/mdlib/nbnxn_gpu_jit_support.h" #include "gromacs/mdtypes/interaction_const.h" -- 2.22.0
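
Note (illustration, not part of the patch): the unification above relies on templating the
shared wait/accounting helpers on the back-end-specific types, so that one definition in
nbnxn_gpu_common.h compiles against both the CUDA and the OpenCL structs. The stand-alone
sketch below shows that mechanism with simplified placeholder types; CudaLikeTimers,
OpenclLikeTimers and Timings are illustrative stand-ins only, not the GROMACS API.

    // Illustrative sketch only -- simplified placeholder types, not the GROMACS API.
    #include <cstdio>

    struct CudaLikeTimers   { double lastKernelMs = 1.5; };  // stand-in for cu_timers_t
    struct OpenclLikeTimers { double lastKernelMs = 2.0; };  // stand-in for cl_timers_t
    struct Timings { double kernelTimeMs = 0.0; int callCount = 0; };  // stand-in for gmx_wallclock_gpu_nbnxn_t

    // A single templated definition serves both back ends; only the timer type
    // differs, just as the shared helpers above are templated on GpuTimers.
    template <typename GpuTimers>
    static void accumulateKernelTime(Timings *timings, const GpuTimers &timers)
    {
        timings->kernelTimeMs += timers.lastKernelMs;
        timings->callCount++;
    }

    int main()
    {
        Timings timings;
        accumulateKernelTime(&timings, CudaLikeTimers{});    // instantiated for the CUDA-like type
        accumulateKernelTime(&timings, OpenclLikeTimers{});  // instantiated for the OpenCL-like type
        std::printf("%g ms accumulated over %d calls\n", timings.kernelTimeMs, timings.callCount);
        return 0;
    }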