From: Aleksei Iupinov
Date: Thu, 9 Nov 2017 18:01:43 +0000 (+0100)
Subject: Rename and expose "generic" GPU memory transfer functions
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=a723f4a28689193a79f8887f5eea18c27882a72f;p=alexxy%2Fgromacs.git

Rename and expose "generic" GPU memory transfer functions

Dropped the "_generic" suffix from the names.
Made the sync/async argument an enum class instead of a boolean.
Made PME use synchronous versions of the functions for unit tests.

Change-Id: I5fd2490d58370d9f0405aea1a74237fa8107cbab
---
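In short, callers now state the transfer behavior explicitly instead of passing a bool.
A minimal sketch of the intended call pattern follows (illustrative only, not part of the
patch; the helper function and its local variables are hypothetical, while cu_copy_D2H and
GpuApiCallBehavior are the entities renamed/introduced in the diff below):

    #include "gromacs/gpu_utils/cudautils.cuh"
    #include "gromacs/gpu_utils/gpu_utils.h"

    /* Copy a device buffer back to the host, blocking only when the caller
     * (for example a test path) needs the data to be valid on return. */
    static void copyOutputToHost(float *h_buffer, float *d_buffer, size_t bytes,
                                 cudaStream_t stream, bool testing)
    {
        const GpuApiCallBehavior behavior = testing ? GpuApiCallBehavior::Sync
                                                    : GpuApiCallBehavior::Async;
        cu_copy_D2H(h_buffer, d_buffer, bytes, behavior, stream);
    }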
diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h
index f4512c4204..6d1b88414e 100644
--- a/src/gromacs/ewald/pme-gpu-internal.h
+++ b/src/gromacs/ewald/pme-gpu-internal.h
@@ -544,7 +544,8 @@ gmx_inline bool pme_gpu_performs_solve(const PmeGpu *pmeGPU)
 /*! \libinternal \brief
  * Enables or disables the testing mode.
- * Testing mode only implies copying all the outputs, even the intermediate ones, to the host.
+ * Testing mode only implies copying all the outputs, even the intermediate ones, to the host,
+ * and also makes the copies synchronous.
  *
  * \param[in] pmeGPU         The PME GPU structure.
  * \param[in] testing        Should the testing mode be enabled, or disabled.
@@ -552,6 +553,7 @@ gmx_inline bool pme_gpu_performs_solve(const PmeGpu *pmeGPU)
 gmx_inline void pme_gpu_set_testing(PmeGpu *pmeGPU, bool testing)
 {
     pmeGPU->settings.copyAllOutputs = testing;
+    pmeGPU->settings.transferKind   = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
 }

 /*! \libinternal \brief
diff --git a/src/gromacs/ewald/pme-gpu-types.h b/src/gromacs/ewald/pme-gpu-types.h
index e95fa882cc..4e768410b7 100644
--- a/src/gromacs/ewald/pme-gpu-types.h
+++ b/src/gromacs/ewald/pme-gpu-types.h
@@ -59,6 +59,7 @@
 #include

 #include "gromacs/ewald/pme.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/basedefinitions.h"
@@ -242,9 +243,11 @@ struct PmeGpuSettings
     /*! \brief A boolean which tells if any PME GPU stage should copy all of its outputs to the host.
      * Only intended to be used by the test framework. */
-    bool copyAllOutputs;
+    bool               copyAllOutputs;
+    /*! \brief An enum which tells whether most PME GPU D2H/H2D data transfers should be synchronous. */
+    GpuApiCallBehavior transferKind;
     /*! \brief Various flags for the current PME computation, corresponding to the GMX_PME_ flags in pme.h. */
-    int currentFlags;
+    int                currentFlags;
 };

 /*! \internal \brief
diff --git a/src/gromacs/ewald/pme-solve.cu b/src/gromacs/ewald/pme-solve.cu
index bf4db7b7a2..eceac20286 100644
--- a/src/gromacs/ewald/pme-solve.cu
+++ b/src/gromacs/ewald/pme-solve.cu
@@ -427,7 +427,8 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,
     if (copyInputAndOutputGrid)
     {
-        cu_copy_H2D_async(kernelParamsPtr->grid.d_fourierGrid, h_grid, pmeGpu->archSpecific->complexGridSize * sizeof(float), stream);
+        cu_copy_H2D(kernelParamsPtr->grid.d_fourierGrid, h_grid, pmeGpu->archSpecific->complexGridSize * sizeof(float),
+                    pmeGpu->settings.transferKind, stream);
     }

     int majorDim = -1, middleDim = -1, minorDim = -1;
@@ -489,12 +490,13 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,

     if (computeEnergyAndVirial)
     {
-        cu_copy_D2H_async(pmeGpu->staging.h_virialAndEnergy, kernelParamsPtr->constants.d_virialAndEnergy,
-                          c_virialAndEnergyCount * sizeof(float), stream);
+        cu_copy_D2H(pmeGpu->staging.h_virialAndEnergy, kernelParamsPtr->constants.d_virialAndEnergy,
+                    c_virialAndEnergyCount * sizeof(float), pmeGpu->settings.transferKind, stream);
     }

     if (copyInputAndOutputGrid)
     {
-        cu_copy_D2H_async(h_grid, kernelParamsPtr->grid.d_fourierGrid, pmeGpu->archSpecific->complexGridSize * sizeof(float), stream);
+        cu_copy_D2H(h_grid, kernelParamsPtr->grid.d_fourierGrid, pmeGpu->archSpecific->complexGridSize * sizeof(float),
+                    pmeGpu->settings.transferKind, stream);
     }
 }
diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu
index ca18afc6c4..a948730cde 100644
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -126,8 +126,8 @@ void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGPU)
         memcpy(pmeGPU->staging.h_splineModuli + splineValuesOffset[i], pmeGPU->common->bsp_mod[i].data(), pmeGPU->common->bsp_mod[i].size() * sizeof(float));
     }
     /* TODO: pin original buffer instead! */
-    cu_copy_H2D_async(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
-                      newSplineValuesSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
+                newSplineValuesSize * sizeof(float), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
 }

 void pme_gpu_free_bspline_values(const PmeGpu *pmeGPU)
@@ -155,7 +155,7 @@ void pme_gpu_copy_input_forces(const PmeGpu *pmeGPU, const float *h_forces)
     GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
     const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
     GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
-    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
 }

 void pme_gpu_copy_output_forces(const PmeGpu *pmeGPU, float *h_forces)
@@ -163,7 +163,7 @@ void pme_gpu_copy_output_forces(const PmeGpu *pmeGPU, float *h_forces)
     GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
     const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
     GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
-    cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream);
+    cu_copy_D2H(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
 }

 void pme_gpu_realloc_coordinates(const PmeGpu *pmeGPU)
@@ -191,8 +191,8 @@ void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGPU, const rvec *h_coordina
     GMX_RELEASE_ASSERT(false, "Only single precision is supported");
     GMX_UNUSED_VALUE(h_coordinates);
 #else
-    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
-                      pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
+                pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
 #endif
 }

@@ -209,8 +209,8 @@ void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGPU, const flo
     cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coefficients, nullptr, sizeof(float),
                         &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc,
                         newCoefficientsSize, pmeGPU->archSpecific->pmeStream, true);
-    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
-                      pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
+                pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
     if (c_usePadding)
     {
         const size_t paddingIndex = pmeGPU->kernelParams->atoms.nAtoms;
@@ -376,13 +376,13 @@ void pme_gpu_free_fract_shifts(const PmeGpu *pmeGPU)
 void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
 {
     const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
-    cu_copy_H2D_async(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
 }

 void pme_gpu_copy_output_spread_grid(const PmeGpu *pmeGpu, float *h_grid)
 {
     const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
-    cu_copy_D2H_async(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
     cudaError_t  stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
     CU_RET_ERR(stat, "PME spread grid sync event record failure");
 }
@@ -393,10 +393,10 @@ void pme_gpu_copy_output_spread_atom_data(const PmeGpu *pmeGpu)
     const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
     const size_t splinesSize  = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
     auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
-    cu_copy_D2H_async(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
-    cu_copy_D2H_async(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
-    cu_copy_D2H_async(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
-                      kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
+                kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
 }

 void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
@@ -417,10 +417,10 @@ void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
         CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_theta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                    "PME failed to clear the spline values");
     }
-    cu_copy_H2D_async(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
-    cu_copy_H2D_async(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
-    cu_copy_H2D_async(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
-                      kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
+    cu_copy_H2D(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+    cu_copy_H2D(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+    cu_copy_H2D(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
+                kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
 }

 void pme_gpu_sync_spread_grid(const PmeGpu *pmeGPU)
diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu
index caba44f3cc..38409c39ad 100644
--- a/src/gromacs/gpu_utils/cudautils.cu
+++ b/src/gromacs/gpu_utils/cudautils.cu
@@ -41,16 +41,15 @@
 #include

 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/smalloc.h"

 /*** Generic CUDA data operation wrappers ***/

-/*! Launches synchronous or asynchronous host to device memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
-                               bool bAsync = false, cudaStream_t s = 0)
+// TODO: template on transferKind to avoid runtime conditionals
+int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes,
+                GpuApiCallBehavior transferKind, cudaStream_t s = 0)
 {
     cudaError_t stat;

@@ -59,16 +58,20 @@ static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
         return -1;
     }

-    if (bAsync)
-    {
-        stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
-        CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
-
-    }
-    else
+    switch (transferKind)
     {
-        stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
-        CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
+        case GpuApiCallBehavior::Async:
+            stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
+            CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
+            CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
+            break;
+
+        default:
+            throw;
     }

     return 0;
@@ -76,7 +79,7 @@ static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,

 int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
 {
-    return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
+    return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Sync);
 }

 /*!
@@ -84,15 +87,12 @@ int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
  */
 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
 {
-    return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
+    return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Async, s);
 }

-/*! Launches synchronous or asynchronous device to host memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
-                               bool bAsync = false, cudaStream_t s = 0)
+// TODO: template on transferKind to avoid runtime conditionals
+int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes,
+                GpuApiCallBehavior transferKind, cudaStream_t s = 0)
 {
     cudaError_t stat;

@@ -101,15 +101,20 @@ static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
         return -1;
     }

-    if (bAsync)
-    {
-        stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
-        CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
-    }
-    else
+    switch (transferKind)
     {
-        stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
-        CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
+        case GpuApiCallBehavior::Async:
+            stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
+            CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
+            CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
+            break;
+
+        default:
+            throw;
     }

     return 0;
@@ -117,7 +122,7 @@ static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,

 int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
 {
-    return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
+    return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Sync);
 }

 /*!
@@ -125,7 +130,7 @@ int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
  */
 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
 {
-    return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
+    return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
 }

 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index b05cf65f99..f8d73c9697 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -46,6 +46,8 @@
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/fatalerror.h"

+enum class GpuApiCallBehavior;
+
 /* TODO error checking needs to be rewritten. We have 2 types of error checks needed
    based on where they occur in the code:
    - non performance-critical: these errors are unsafe to be ignored and must be
@@ -134,6 +136,11 @@ struct gmx_device_info_t
 #endif /* HAVE_NVML */
 };

+/*! Launches synchronous or asynchronous device to host memory copy.
+ *
+ * The copy is launched in stream s or if not specified, in stream 0.
+ */
+int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t s /*= 0*/);
 /*! Launches synchronous host to device memory copy in stream 0. */
 int cu_copy_D2H_sync(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);

@@ -141,6 +148,12 @@ int cu_copy_D2H_sync(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
 /*! Launches asynchronous host to device memory copy in stream s. */
 int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);

+/*! Launches synchronous or asynchronous host to device memory copy.
+ *
+ * The copy is launched in stream s or if not specified, in stream 0.
+ */
+int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = 0*/);
+
 /*! Launches synchronous host to device memory copy. */
 int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index c85b1bed66..ad2297eae0 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -60,6 +60,13 @@ namespace gmx
 class MDLogger;
 }

+//! Enum which is only used to describe transfer calls at the moment
+enum class GpuApiCallBehavior
+{
+    Sync,
+    Async
+};
+
 /*! \brief Detect all GPUs in the system.
  *
  * Will detect every GPU supported by the device driver in use. Also
diff --git a/src/gromacs/gpu_utils/oclutils.cpp b/src/gromacs/gpu_utils/oclutils.cpp
index 384242f827..eb270c0d1d 100644
--- a/src/gromacs/gpu_utils/oclutils.cpp
+++ b/src/gromacs/gpu_utils/oclutils.cpp
@@ -48,20 +48,15 @@
 #include

+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/smalloc.h"

-/*! \brief Launches synchronous or asynchronous host to device memory copy.
- *
- * If copy_event is not NULL, on return it will contain an event object
- * identifying this particular host to device operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-static int ocl_copy_H2D_generic(cl_mem d_dest, void* h_src,
-                                size_t offset, size_t bytes,
-                                bool bAsync /* = false*/,
-                                cl_command_queue command_queue,
-                                cl_event *copy_event)
+int ocl_copy_H2D(cl_mem d_dest, void* h_src,
+                 size_t offset, size_t bytes,
+                 GpuApiCallBehavior transferKind,
+                 cl_command_queue command_queue,
+                 cl_event *copy_event)
 {
     cl_int gmx_unused cl_error;

@@ -70,17 +65,22 @@ static int ocl_copy_H2D_generic(cl_mem d_dest, void* h_src,
         return -1;
     }

-    if (bAsync)
+    switch (transferKind)
     {
-        cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
-        assert(cl_error == CL_SUCCESS);
-        // TODO: handle errors
-    }
-    else
-    {
-        cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
-        assert(cl_error == CL_SUCCESS);
-        // TODO: handle errors
+        case GpuApiCallBehavior::Async:
+            cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+            break;
+
+        default:
+            throw;
     }

     return 0;
@@ -97,7 +97,7 @@ int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
                        cl_command_queue command_queue,
                        cl_event *copy_event)
 {
-    return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, true, command_queue, copy_event);
+    return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
 }

 /*! \brief Launches synchronous host to device memory copy.
@@ -106,20 +106,14 @@ int ocl_copy_H2D_sync(cl_mem d_dest, void * h_src,
                       size_t offset, size_t bytes,
                       cl_command_queue command_queue)
 {
-    return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, false, command_queue, NULL);
+    return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Sync, command_queue, NULL);
 }

-/*! \brief Launches synchronous or asynchronous device to host memory copy.
- *
- * If copy_event is not NULL, on return it will contain an event object
- * identifying this particular device to host operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-static int ocl_copy_D2H_generic(void * h_dest, cl_mem d_src,
-                                size_t offset, size_t bytes,
-                                bool bAsync,
-                                cl_command_queue command_queue,
-                                cl_event *copy_event)
+int ocl_copy_D2H(void * h_dest, cl_mem d_src,
+                 size_t offset, size_t bytes,
+                 GpuApiCallBehavior transferKind,
+                 cl_command_queue command_queue,
+                 cl_event *copy_event)
 {
     cl_int gmx_unused cl_error;

@@ -128,17 +122,22 @@ static int ocl_copy_D2H_generic(void * h_dest, cl_mem d_src,
         return -1;
     }

-    if (bAsync)
+    switch (transferKind)
     {
-        cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
-        assert(cl_error == CL_SUCCESS);
-        // TODO: handle errors
-    }
-    else
-    {
-        cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
-        assert(cl_error == CL_SUCCESS);
-        // TODO: handle errors
+        case GpuApiCallBehavior::Async:
+            cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+            break;
+
+        default:
+            throw;
     }

     return 0;
@@ -155,7 +154,7 @@ int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
                        cl_command_queue command_queue,
                        cl_event *copy_event)
 {
-    return ocl_copy_D2H_generic(h_dest, d_src, offset, bytes, true, command_queue, copy_event);
+    return ocl_copy_D2H(h_dest, d_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
 }

 /*! \brief \brief Allocates nbytes of host memory. Use ocl_free to free memory allocated with this function.
diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h
index 232ad4f5fe..184eb6dbfb 100644
--- a/src/gromacs/gpu_utils/oclutils.h
+++ b/src/gromacs/gpu_utils/oclutils.h
@@ -46,6 +46,8 @@
 #include "gromacs/gpu_utils/gmxopencl.h"
 #include "gromacs/utility/gmxassert.h"

+enum class GpuApiCallBehavior;
+
 /*! \brief OpenCL vendor IDs */
 typedef enum {
     OCL_VENDOR_NVIDIA = 0,
@@ -101,15 +103,39 @@ struct gmx_device_runtime_data_t
     cl_program program; /**< OpenCL program */
 };

+/*! \brief Launches synchronous or asynchronous device to host memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular device to host operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_D2H(void * h_dest, cl_mem d_src,
+                 size_t offset, size_t bytes,
+                 GpuApiCallBehavior transferKind,
+                 cl_command_queue command_queue,
+                 cl_event *copy_event);
-/*! \brief Launches asynchronous host to device memory copy. */
-int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
+
+/*! \brief Launches asynchronous device to host memory copy. */
+int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
                        size_t offset, size_t bytes,
                        cl_command_queue command_queue,
                        cl_event *copy_event);

-/*! \brief Launches asynchronous device to host memory copy. */
-int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
+/*! \brief Launches synchronous or asynchronous host to device memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular host to device operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_H2D(cl_mem d_dest, void* h_src,
+                 size_t offset, size_t bytes,
+                 GpuApiCallBehavior transferKind,
+                 cl_command_queue command_queue,
+                 cl_event *copy_event);
+
+/*! \brief Launches asynchronous host to device memory copy. */
+int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
                        size_t offset, size_t bytes,
                        cl_command_queue command_queue,
                        cl_event *copy_event);
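
The OpenCL wrappers follow the same pattern. A second minimal sketch, again illustrative
rather than part of the patch (the buffer, size and queue variables are hypothetical;
ocl_copy_H2D and GpuApiCallBehavior come from the headers changed above). The enum maps
to the blocking flag of clEnqueueWriteBuffer/clEnqueueReadBuffer; an event may be
requested in either case, and the synchronous wrapper in the patch simply passes NULL:

    #include "gromacs/gpu_utils/gpu_utils.h"
    #include "gromacs/gpu_utils/oclutils.h"

    static void uploadInput(cl_mem d_input, void *h_input, size_t bytes,
                            cl_command_queue queue)
    {
        cl_event copyDone;
        /* Non-blocking enqueue; copyDone can later be waited on or used for profiling. */
        ocl_copy_H2D(d_input, h_input, 0, bytes,
                     GpuApiCallBehavior::Async, queue, &copyDone);

        /* Blocking enqueue; no event is needed, so NULL is passed, as in
         * the ocl_copy_H2D_sync wrapper. */
        ocl_copy_H2D(d_input, h_input, 0, bytes,
                     GpuApiCallBehavior::Sync, queue, NULL);
    }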