/*! \libinternal \brief
* Enables or disables the testing mode.
- * Testing mode only implies copying all the outputs, even the intermediate ones, to the host.
+ * Testing mode only implies copying all the outputs, even the intermediate ones, to the host,
+ * and also makes the copies synchronous.
*
* \param[in] pmeGPU The PME GPU structure.
* \param[in] testing Should the testing mode be enabled or disabled.
gmx_inline void pme_gpu_set_testing(PmeGpu *pmeGPU, bool testing)
{
pmeGPU->settings.copyAllOutputs = testing;
+ pmeGPU->settings.transferKind = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
}
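/* A minimal usage sketch of the testing mode toggle (the test helper name
 * pmeGpuFromTestContext below is hypothetical, not part of this change):
 *
 * \code
 *     PmeGpu *pmeGPU = pmeGpuFromTestContext();
 *     pme_gpu_set_testing(pmeGPU, true);  // all outputs copied to the host, synchronously
 *     // ... run spread/solve/gather and inspect the staged host buffers ...
 *     pme_gpu_set_testing(pmeGPU, false); // restore asynchronous transfers
 * \endcode
 */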
/*! \libinternal \brief
#include <vector>
#include "gromacs/ewald/pme.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/utility/basedefinitions.h"
/*! \brief A boolean which tells if any PME GPU stage should copy all of its outputs to the host.
* Only intended to be used by the test framework.
*/
- bool copyAllOutputs;
+ bool copyAllOutputs;
+ /*! \brief An enum which tells whether most PME GPU D2H/H2D data transfers should be synchronous. */
+ GpuApiCallBehavior transferKind;
/*! \brief Various flags for the current PME computation, corresponding to the GMX_PME_ flags in pme.h. */
- int currentFlags;
+ int currentFlags;
};
/*! \internal \brief
if (copyInputAndOutputGrid)
{
- cu_copy_H2D_async(kernelParamsPtr->grid.d_fourierGrid, h_grid, pmeGpu->archSpecific->complexGridSize * sizeof(float), stream);
+ cu_copy_H2D(kernelParamsPtr->grid.d_fourierGrid, h_grid, pmeGpu->archSpecific->complexGridSize * sizeof(float),
+ pmeGpu->settings.transferKind, stream);
}
int majorDim = -1, middleDim = -1, minorDim = -1;
if (computeEnergyAndVirial)
{
- cu_copy_D2H_async(pmeGpu->staging.h_virialAndEnergy, kernelParamsPtr->constants.d_virialAndEnergy,
- c_virialAndEnergyCount * sizeof(float), stream);
+ cu_copy_D2H(pmeGpu->staging.h_virialAndEnergy, kernelParamsPtr->constants.d_virialAndEnergy,
+ c_virialAndEnergyCount * sizeof(float), pmeGpu->settings.transferKind, stream);
}
if (copyInputAndOutputGrid)
{
- cu_copy_D2H_async(h_grid, kernelParamsPtr->grid.d_fourierGrid, pmeGpu->archSpecific->complexGridSize * sizeof(float), stream);
+ cu_copy_D2H(h_grid, kernelParamsPtr->grid.d_fourierGrid, pmeGpu->archSpecific->complexGridSize * sizeof(float),
+ pmeGpu->settings.transferKind, stream);
}
}
memcpy(pmeGPU->staging.h_splineModuli + splineValuesOffset[i], pmeGPU->common->bsp_mod[i].data(), pmeGPU->common->bsp_mod[i].size() * sizeof(float));
}
/* TODO: pin original buffer instead! */
- cu_copy_H2D_async(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
- newSplineValuesSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
+ cu_copy_H2D(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
+ newSplineValuesSize * sizeof(float), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
}
void pme_gpu_free_bspline_values(const PmeGpu *pmeGPU)
GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
- cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->archSpecific->pmeStream);
+ cu_copy_H2D(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
}
void pme_gpu_copy_output_forces(const PmeGpu *pmeGPU, float *h_forces)
GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
- cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream);
+ cu_copy_D2H(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
}
void pme_gpu_realloc_coordinates(const PmeGpu *pmeGPU)
GMX_RELEASE_ASSERT(false, "Only single precision is supported");
GMX_UNUSED_VALUE(h_coordinates);
#else
- cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
- pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->archSpecific->pmeStream);
+ cu_copy_H2D(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
+ pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
#endif
}
cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coefficients, nullptr, sizeof(float),
&pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc,
newCoefficientsSize, pmeGPU->archSpecific->pmeStream, true);
- cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
- pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->archSpecific->pmeStream);
+ cu_copy_H2D(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
+ pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
if (c_usePadding)
{
const size_t paddingIndex = pmeGPU->kernelParams->atoms.nAtoms;
void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
{
const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
- cu_copy_H2D_async(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->archSpecific->pmeStream);
+ cu_copy_H2D(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_output_spread_grid(const PmeGpu *pmeGpu, float *h_grid)
{
const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
- cu_copy_D2H_async(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->archSpecific->pmeStream);
+ cu_copy_D2H(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
CU_RET_ERR(stat, "PME spread grid sync event record failure");
}
const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
const size_t splinesSize = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
auto *kernelParamsPtr = pmeGpu->kernelParams.get();
- cu_copy_D2H_async(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
- cu_copy_D2H_async(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
- cu_copy_D2H_async(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
- kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
+ cu_copy_D2H(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+ cu_copy_D2H(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+ cu_copy_D2H(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
+ kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_theta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
"PME failed to clear the spline values");
}
- cu_copy_H2D_async(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
- cu_copy_H2D_async(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
- cu_copy_H2D_async(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
- kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
+ cu_copy_H2D(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+ cu_copy_H2D(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+ cu_copy_H2D(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
+ kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_sync_spread_grid(const PmeGpu *pmeGPU)
#include <cstdlib>
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"
/*** Generic CUDA data operation wrappers ***/
-/*! Launches synchronous or asynchronous host to device memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
- bool bAsync = false, cudaStream_t s = 0)
+// TODO: template on transferKind to avoid runtime conditionals
+int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes,
+ GpuApiCallBehavior transferKind, cudaStream_t s = 0)
{
cudaError_t stat;
return -1;
}
- if (bAsync)
- {
- stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
- CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
-
- }
- else
+ switch (transferKind)
{
- stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
- CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
+ case GpuApiCallBehavior::Async:
+ stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
+ CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
+ break;
+
+ case GpuApiCallBehavior::Sync:
+ stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
+ CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
+ break;
+
+ default:
+ throw;
}
return 0;
int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
{
- return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
+ return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Sync);
}
/*!
*/
int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
{
- return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
+ return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Async, s);
}
-/*! Launches synchronous or asynchronous device to host memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
- bool bAsync = false, cudaStream_t s = 0)
+// TODO: template on transferKind to avoid runtime conditionals
+int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes,
+ GpuApiCallBehavior transferKind, cudaStream_t s = 0)
{
cudaError_t stat;
return -1;
}
- if (bAsync)
- {
- stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
- CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
- }
- else
+ switch (transferKind)
{
- stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
- CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
+ case GpuApiCallBehavior::Async:
+ stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
+ CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
+ break;
+
+ case GpuApiCallBehavior::Sync:
+ stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
+ CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
+ break;
+
+ default:
+ throw;
}
return 0;
int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
{
- return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
+ return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Sync);
}
/*!
*/
int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
{
- return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
+ return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
}
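/* A rough sketch of what the TODO notes above might look like: making transferKind a
 * compile-time template parameter so that the Sync/Async branch is resolved at compile
 * time. This is only an illustration, not part of this change; the name
 * cu_copy_D2H_impl is hypothetical.
 *
 *     template <GpuApiCallBehavior transferKind>
 *     static int cu_copy_D2H_impl(void *h_dest, void *d_src, size_t bytes, cudaStream_t s)
 *     {
 *         cudaError_t stat = (transferKind == GpuApiCallBehavior::Async) ?
 *             cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s) :
 *             cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
 *         CU_RET_ERR(stat, "DtoH copy failed");
 *         return 0;
 *     }
 */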
/**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
#include "gromacs/math/vectypes.h"
#include "gromacs/utility/fatalerror.h"
+enum class GpuApiCallBehavior;
+
/* TODO error checking needs to be rewritten. We have 2 types of error checks needed
based on where they occur in the code:
- non performance-critical: these errors are unsafe to be ignored and must be
#endif /* HAVE_NVML */
};
+/*! Launches synchronous or asynchronous device to host memory copy.
+ *
+ * The copy is launched in stream s or, if not specified, in stream 0.
+ */
+int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t s /*= 0*/);
/*! Launches synchronous device to host memory copy in stream 0. */
int cu_copy_D2H_sync(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
/*! Launches asynchronous device to host memory copy in stream s. */
int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+/*! Launches synchronous or asynchronous host to device memory copy.
+ *
+ * The copy is launched in stream s or, if not specified, in stream 0.
+ */
+int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = 0*/);
+
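/* A rough usage sketch of the unified copy wrappers above, assuming the device pointer,
 * host buffer, size and stream (d_data, h_data, byteCount, stream: placeholder names)
 * have already been set up:
 *
 * \code
 *     // asynchronous H2D copy enqueued in the given stream
 *     cu_copy_H2D(d_data, h_data, byteCount, GpuApiCallBehavior::Async, stream);
 *     // ... launch kernels in the same stream ...
 *     // synchronous D2H copy; returns only after the data has arrived on the host
 *     cu_copy_D2H(h_data, d_data, byteCount, GpuApiCallBehavior::Sync, stream);
 * \endcode
 */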
/*! Launches synchronous host to device memory copy. */
int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
class MDLogger;
}
+//! Enum which is only used to describe transfer calls at the moment
+enum class GpuApiCallBehavior
+{
+ Sync,
+ Async
+};
+
/*! \brief Detect all GPUs in the system.
*
* Will detect every GPU supported by the device driver in use. Also
#include <string>
+#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
-/*! \brief Launches synchronous or asynchronous host to device memory copy.
- *
- * If copy_event is not NULL, on return it will contain an event object
- * identifying this particular host to device operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-static int ocl_copy_H2D_generic(cl_mem d_dest, void* h_src,
- size_t offset, size_t bytes,
- bool bAsync /* = false*/,
- cl_command_queue command_queue,
- cl_event *copy_event)
+int ocl_copy_H2D(cl_mem d_dest, void* h_src,
+ size_t offset, size_t bytes,
+ GpuApiCallBehavior transferKind,
+ cl_command_queue command_queue,
+ cl_event *copy_event)
{
cl_int gmx_unused cl_error;
return -1;
}
- if (bAsync)
+ switch (transferKind)
{
- cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
- assert(cl_error == CL_SUCCESS);
- // TODO: handle errors
- }
- else
- {
- cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
- assert(cl_error == CL_SUCCESS);
- // TODO: handle errors
+ case GpuApiCallBehavior::Async:
+ cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
+ assert(cl_error == CL_SUCCESS);
+ // TODO: handle errors
+ break;
+
+ case GpuApiCallBehavior::Sync:
+ cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
+ assert(cl_error == CL_SUCCESS);
+ // TODO: handle errors
+ break;
+
+ default:
+ throw;
}
return 0;
cl_command_queue command_queue,
cl_event *copy_event)
{
- return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, true, command_queue, copy_event);
+ return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
}
/*! \brief Launches synchronous host to device memory copy.
size_t offset, size_t bytes,
cl_command_queue command_queue)
{
- return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, false, command_queue, NULL);
+ return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Sync, command_queue, NULL);
}
-/*! \brief Launches synchronous or asynchronous device to host memory copy.
- *
- * If copy_event is not NULL, on return it will contain an event object
- * identifying this particular device to host operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-static int ocl_copy_D2H_generic(void * h_dest, cl_mem d_src,
- size_t offset, size_t bytes,
- bool bAsync,
- cl_command_queue command_queue,
- cl_event *copy_event)
+int ocl_copy_D2H(void * h_dest, cl_mem d_src,
+ size_t offset, size_t bytes,
+ GpuApiCallBehavior transferKind,
+ cl_command_queue command_queue,
+ cl_event *copy_event)
{
cl_int gmx_unused cl_error;
return -1;
}
- if (bAsync)
+ switch (transferKind)
{
- cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
- assert(cl_error == CL_SUCCESS);
- // TODO: handle errors
- }
- else
- {
- cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
- assert(cl_error == CL_SUCCESS);
- // TODO: handle errors
+ case GpuApiCallBehavior::Async:
+ cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
+ assert(cl_error == CL_SUCCESS);
+ // TODO: handle errors
+ break;
+
+ case GpuApiCallBehavior::Sync:
+ cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
+ assert(cl_error == CL_SUCCESS);
+ // TODO: handle errors
+ break;
+
+ default:
+ throw;
}
return 0;
cl_command_queue command_queue,
cl_event *copy_event)
{
- return ocl_copy_D2H_generic(h_dest, d_src, offset, bytes, true, command_queue, copy_event);
+ return ocl_copy_D2H(h_dest, d_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
}
/*! \brief Allocates nbytes of host memory. Use ocl_free to free memory allocated with this function.
#include "gromacs/gpu_utils/gmxopencl.h"
#include "gromacs/utility/gmxassert.h"
+enum class GpuApiCallBehavior;
+
/*! \brief OpenCL vendor IDs */
typedef enum {
OCL_VENDOR_NVIDIA = 0,
cl_program program; /**< OpenCL program */
};
+/*! \brief Launches synchronous or asynchronous device to host memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular device to host operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_D2H(void * h_dest, cl_mem d_src,
+ size_t offset, size_t bytes,
+ GpuApiCallBehavior transferKind,
+ cl_command_queue command_queue,
+ cl_event *copy_event);
-/*! \brief Launches asynchronous host to device memory copy. */
-int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
+
+/*! \brief Launches asynchronous device to host memory copy. */
+int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
size_t offset, size_t bytes,
cl_command_queue command_queue,
cl_event *copy_event);
-/*! \brief Launches asynchronous device to host memory copy. */
-int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
+/*! \brief Launches synchronous or asynchronous host to device memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular host to device operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_H2D(cl_mem d_dest, void* h_src,
+ size_t offset, size_t bytes,
+ GpuApiCallBehavior transferKind,
+ cl_command_queue command_queue,
+ cl_event *copy_event);
+
+/*! \brief Launches asynchronous host to device memory copy. */
+int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
size_t offset, size_t bytes,
cl_command_queue command_queue,
cl_event *copy_event);
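/* A rough usage sketch of the OpenCL copy wrappers, mirroring the CUDA ones; d_buffer,
 * h_buffer, byteCount and queue are placeholder names for an already initialized cl_mem
 * object, host buffer, size and command queue. The event pointer may be NULL when the
 * caller does not need to wait on or profile the transfer.
 *
 * \code
 *     cl_event copyDone;
 *     ocl_copy_H2D(d_buffer, h_buffer, 0, byteCount,
 *                  GpuApiCallBehavior::Async, queue, &copyDone);
 *     // ... enqueue kernels on the same queue ...
 *     ocl_copy_D2H(h_buffer, d_buffer, 0, byteCount,
 *                  GpuApiCallBehavior::Sync, queue, NULL);
 * \endcode
 */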