Rename and expose "generic" GPU memory transfer functions

[alexxy/gromacs.git] / src / gromacs / gpu_utils / cudautils.cu
diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu

index caba44f3cc7a19344199622d53c986a8086fc1ba..38409c39ad9f99325e7be48ee5d7bc7983dd7729 100644 (file)
--- a/src/gromacs/gpu_utils/cudautils.cu
+++ b/src/gromacs/gpu_utils/cudautils.cu
@@ -41,16 +41,15 @@
  #include <cstdlib>
  
  #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/smalloc.h"
  
  /*** Generic CUDA data operation wrappers ***/
  
-/*! Launches synchronous or asynchronous host to device memory copy.
- *
- *  The copy is launched in stream s or if not specified, in stream 0.
- */
-static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
-                               bool bAsync = false, cudaStream_t s = 0)
+// TODO: template on transferKind to avoid runtime conditionals
+int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes,
+                GpuApiCallBehavior transferKind, cudaStream_t s = 0)
  {
      cudaError_t stat;
  
@@ -59,16 +58,20 @@ static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
          return -1;
      }
  
-    if (bAsync)
-    {
-        stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
-        CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
-
-    }
-    else
+    switch (transferKind)
      {
-        stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
-        CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
+        case GpuApiCallBehavior::Async:
+            stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
+            CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
+            CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
+            break;
+
+        default:
+            throw;
      }
  
      return 0;
@@ -76,7 +79,7 @@ static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
  
  int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
  {
-    return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
+    return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Sync);
  }
  
  /*!
@@ -84,15 +87,12 @@ int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
   */
  int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
  {
-    return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
+    return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Async, s);
  }
  
-/*! Launches synchronous or asynchronous device to host memory copy.
- *
- *  The copy is launched in stream s or if not specified, in stream 0.
- */
-static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
-                               bool bAsync = false, cudaStream_t s = 0)
+// TODO: template on transferKind to avoid runtime conditionals
+int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes,
+                GpuApiCallBehavior transferKind, cudaStream_t s = 0)
  {
      cudaError_t stat;
  
@@ -101,15 +101,20 @@ static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
          return -1;
      }
  
-    if (bAsync)
-    {
-        stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
-        CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
-    }
-    else
+    switch (transferKind)
      {
-        stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
-        CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
+        case GpuApiCallBehavior::Async:
+            stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
+            CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
+            CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
+            break;
+
+        default:
+            throw;
      }
  
      return 0;
@@ -117,7 +122,7 @@ static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
  
  int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
  {
-    return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
+    return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Sync);
  }
  
  /*!
@@ -125,7 +130,7 @@ int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
   */
  int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
  {
-    return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
+    return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
  }
  
  /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/