Remove leftover support for pre-9.0 CUDA
author     Mark Abraham <mark.j.abraham@gmail.com>
           Tue, 13 Aug 2019 12:31:57 +0000 (14:31 +0200)
committer  Mark Abraham <mark.j.abraham@gmail.com>
           Fri, 27 Sep 2019 07:02:46 +0000 (09:02 +0200)
GROMACS now requires CUDA 9.0 or newer, so the compatibility wrappers
gmx_syncwarp(), gmx_ballot_sync(), gmx_any_sync(), gmx_shfl_up_sync()
and gmx_shfl_down_sync() in cuda_arch_utils.cuh no longer select
anything but the native intrinsics. Remove the wrappers and call
__syncwarp(), __ballot_sync(), __any_sync(), __shfl_up_sync() and
__shfl_down_sync() directly from the PME and NBNXM CUDA kernels.

Refs #2831

Change-Id: I7ec33bb3582006123e745d06da27c9eed12fbfc2

src/gromacs/ewald/pme_gather.clh
src/gromacs/ewald/pme_gather.cu
src/gromacs/ewald/pme_solve.cu
src/gromacs/ewald/pme_spread.clh
src/gromacs/ewald/pme_spread.cu
src/gromacs/gpu_utils/cuda_arch_utils.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
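
The change is mechanical: every wrapper call becomes a direct call to the
corresponding CUDA 9 warp-synchronous intrinsic. For reference, a minimal
standalone sketch of the reduction idiom these kernels use (illustrative
only, not GROMACS code; kFullWarpMask mirrors c_fullWarpMask from
cuda_arch_utils.cuh):

    // Warp-wide sum with __shfl_down_sync; launch as warpSum<<<1, 32>>>(in, out).
    static const unsigned int kFullWarpMask = 0xffffffff;

    __global__ void warpSum(const float* in, float* out)
    {
        float v = in[threadIdx.x];
        // Fold the upper half of the remaining lanes onto the lower half.
        // All 32 lanes participate, so the full warp mask is correct here.
        for (int delta = 16; delta > 0; delta >>= 1)
        {
            v += __shfl_down_sync(kFullWarpMask, v, delta);
        }
        if (threadIdx.x == 0)
        {
            *out = v; // lane 0 now holds the warp-wide sum
        }
    }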

diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh
index e2ad100e48d07a7b774671fd0a311d37d9d96b32..52ed41d70fb40f4ebdbf078b83d477f1f05aa3fb 100644
--- a/src/gromacs/ewald/pme_gather.clh
+++ b/src/gromacs/ewald/pme_gather.clh
@@ -351,7 +351,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     }
 
 #if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
-    /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was gmx_syncwarp() in CUDA.
+    /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was __syncwarp() in CUDA.
      * #2519
      */
     barrier(CLK_LOCAL_MEM_FENCE);
diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu
index 6c29ebcf2156670a5ef779da86e697220a7c079e..be34c3c6cda751526bb88c070d2276f4cbb648bd 100644
--- a/src/gromacs/ewald/pme_gather.cu
+++ b/src/gromacs/ewald/pme_gather.cu
@@ -100,17 +100,17 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
         static_assert(atomDataSize <= warp_size, "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
         const int width = atomDataSize;
 
-        fx += gmx_shfl_down_sync(activeMask, fx, 1, width);
-        fy += gmx_shfl_up_sync  (activeMask, fy, 1, width);
-        fz += gmx_shfl_down_sync(activeMask, fz, 1, width);
+        fx += __shfl_down_sync(activeMask, fx, 1, width);
+        fy += __shfl_up_sync  (activeMask, fy, 1, width);
+        fz += __shfl_down_sync(activeMask, fz, 1, width);
 
         if (splineIndex & 1)
         {
             fx = fy;
         }
 
-        fx += gmx_shfl_down_sync(activeMask, fx, 2, width);
-        fz += gmx_shfl_up_sync  (activeMask, fz, 2, width);
+        fx += __shfl_down_sync(activeMask, fx, 2, width);
+        fz += __shfl_up_sync  (activeMask, fz, 2, width);
 
         if (splineIndex & 2)
         {
@@ -124,7 +124,7 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
         // We have to just further reduce those groups of 4
         for (int delta = 4; delta < atomDataSize; delta <<= 1)
         {
-            fx += gmx_shfl_down_sync(activeMask, fx, delta, width);
+            fx += __shfl_down_sync(activeMask, fx, delta, width);
         }
 
         const int dimIndex = splineIndex;
@@ -194,7 +194,7 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
                 }
             }
 
-            gmx_syncwarp();
+            __syncwarp();
 
             const float n         = read_grid_size(realGridSizeFP, dimIndex);
             const int   atomIndex = sourceIndex / minStride;
@@ -378,7 +378,7 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
         sm_forces[forceIndexLocal] = result;
     }
 
-    gmx_syncwarp();
+    __syncwarp();
     assert(atomsPerBlock <= warp_size);
 
     /* Writing or adding the final forces component-wise, single warp */
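
The interleaved up/down shuffles above let the x, y and z force components
share one reduction tree instead of needing three. A minimal sketch of the
pattern for a single 4-lane group (illustrative only, not the GROMACS
kernel; lane is the lane index within the warp):

    // After two rounds, lane 0 of each 4-lane group holds sum(fx),
    // lane 1 holds sum(fy) and lane 2 holds sum(fz); lane 3 is unused.
    __device__ float reduceXYZ(float fx, float fy, float fz, int lane)
    {
        const unsigned int mask  = 0xffffffff; // all 32 lanes participate
        const int          width = 4;          // independent 4-lane subgroups

        // Round 1: pair sums; fy travels up so the odd lanes can adopt it.
        fx += __shfl_down_sync(mask, fx, 1, width);
        fy += __shfl_up_sync  (mask, fy, 1, width);
        fz += __shfl_down_sync(mask, fz, 1, width);
        if (lane & 1)
        {
            fx = fy; // odd lanes now carry the y pair sums
        }

        // Round 2: quad sums; fz travels up so lanes 2-3 can adopt it.
        fx += __shfl_down_sync(mask, fx, 2, width);
        fz += __shfl_up_sync  (mask, fz, 2, width);
        if (lane & 2)
        {
            fx = fz; // lanes 2-3 now carry the z sum
        }
        return fx;
    }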
diff --git a/src/gromacs/ewald/pme_solve.cu b/src/gromacs/ewald/pme_solve.cu
index a48680283dfb4b6b4346dd6b0553f59530c1c9e3..401d82f21844645d7ca0aca934fbe5ca173a9b46 100644
--- a/src/gromacs/ewald/pme_solve.cu
+++ b/src/gromacs/ewald/pme_solve.cu
@@ -252,13 +252,13 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
         const unsigned int activeMask = c_fullWarpMask;
 
         /* Making pair sums */
-        virxx  += gmx_shfl_down_sync(activeMask, virxx, 1, width);
-        viryy  += gmx_shfl_up_sync  (activeMask, viryy, 1, width);
-        virzz  += gmx_shfl_down_sync(activeMask, virzz, 1, width);
-        virxy  += gmx_shfl_up_sync  (activeMask, virxy, 1, width);
-        virxz  += gmx_shfl_down_sync(activeMask, virxz, 1, width);
-        viryz  += gmx_shfl_up_sync  (activeMask, viryz, 1, width);
-        energy += gmx_shfl_down_sync(activeMask, energy, 1, width);
+        virxx  += __shfl_down_sync(activeMask, virxx, 1, width);
+        viryy  += __shfl_up_sync  (activeMask, viryy, 1, width);
+        virzz  += __shfl_down_sync(activeMask, virzz, 1, width);
+        virxy  += __shfl_up_sync  (activeMask, virxy, 1, width);
+        virxz  += __shfl_down_sync(activeMask, virxz, 1, width);
+        viryz  += __shfl_up_sync  (activeMask, viryz, 1, width);
+        energy += __shfl_down_sync(activeMask, energy, 1, width);
         if (threadLocalId & 1)
         {
             virxx = viryy; // virxx now holds virxx and viryy pair sums
@@ -267,10 +267,10 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
         }
 
         /* Making quad sums */
-        virxx  += gmx_shfl_down_sync(activeMask, virxx, 2, width);
-        virzz  += gmx_shfl_up_sync  (activeMask, virzz, 2, width);
-        virxz  += gmx_shfl_down_sync(activeMask, virxz, 2, width);
-        energy += gmx_shfl_up_sync  (activeMask, energy, 2, width);
+        virxx  += __shfl_down_sync(activeMask, virxx, 2, width);
+        virzz  += __shfl_up_sync  (activeMask, virzz, 2, width);
+        virxz  += __shfl_down_sync(activeMask, virxz, 2, width);
+        energy += __shfl_up_sync  (activeMask, energy, 2, width);
         if (threadLocalId & 2)
         {
             virxx = virzz;  // virxx now holds quad sums of virxx, viryy, virzz and virxy
@@ -278,8 +278,8 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
         }
 
         /* Making octet sums */
-        virxx += gmx_shfl_down_sync(activeMask, virxx, 4, width);
-        virxz += gmx_shfl_up_sync  (activeMask, virxz, 4, width);
+        virxx += __shfl_down_sync(activeMask, virxx, 4, width);
+        virxz += __shfl_up_sync  (activeMask, virxz, 4, width);
         if (threadLocalId & 4)
         {
             virxx = virxz; // virxx now holds all 7 components' octet sums + unused paddings
@@ -289,7 +289,7 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
 #pragma unroll
         for (int delta = 8; delta < width; delta <<= 1)
         {
-            virxx += gmx_shfl_down_sync(activeMask, virxx, delta, width);
+            virxx += __shfl_down_sync(activeMask, virxx, delta, width);
         }
         /* Now first 7 threads of each warp have the full output contributions in virxx */
 
@@ -334,7 +334,7 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
 #pragma unroll
             for (int delta = stride; delta < warp_size; delta <<= 1)
             {
-                output += gmx_shfl_down_sync(activeMask, output, delta, warp_size);
+                output += __shfl_down_sync(activeMask, output, delta, warp_size);
             }
             /* Final output */
             if (validComponentIndex)
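
Worked out, the pair/quad/octet folding above packs all seven per-warp
outputs into virxx (a mapping the code implies but never states; derived
by tracing the shuffles):

    lane 0: virxx   lane 1: viryy   lane 2: virzz   lane 3: virxy
    lane 4: virxz   lane 5: viryz   lane 6: energy  lane 7: unused

The closing delta = 8, 16, ... loop then folds these 8-lane groups across
the warp, which is why the first 7 threads of each warp end up holding the
full output contributions.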
diff --git a/src/gromacs/ewald/pme_spread.clh b/src/gromacs/ewald/pme_spread.clh
index 2b140133c11adb06eec94e9e2542927b496160b2..e64d22382270bdb4af02cb41d09e9757031d3a0d 100644
--- a/src/gromacs/ewald/pme_spread.clh
+++ b/src/gromacs/ewald/pme_spread.clh
@@ -428,7 +428,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_spline_and_spread_kernel)(const struct
                           sm_fractCoords, gm_theta, gm_dtheta, gm_gridlineIndices,
                           gm_fractShiftsTable, gm_gridlineIndicesTable);
 #if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
-        /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was gmx_syncwarp() in CUDA.
+        /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was __syncwarp() in CUDA.
          * #2519
          */
         barrier(CLK_LOCAL_MEM_FENCE);
diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu
index aa143ca451046b12402ee80da06f47d7a0b89624..4037a71f7683a8db0d72143f5e42a85c8d6e07aa 100644
--- a/src/gromacs/ewald/pme_spread.cu
+++ b/src/gromacs/ewald/pme_spread.cu
@@ -449,7 +449,7 @@ __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernel
         __syncthreads();
         calculate_splines<order, atomsPerBlock>(kernelParams, atomIndexOffset, (const float3 *)sm_coordinates,
                                                 sm_coefficients, sm_theta, sm_gridlineIndices);
-        gmx_syncwarp();
+        __syncwarp();
     }
     else
     {
diff --git a/src/gromacs/gpu_utils/cuda_arch_utils.cuh b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
index d722aa1feb83041cd3b1dcef065a4bafbb6f38cc..8f97d1ab932a6a5dd8c5ba3c15f95a43ec043be7 100644
--- a/src/gromacs/gpu_utils/cuda_arch_utils.cuh
+++ b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
@@ -35,8 +35,6 @@
 #ifndef CUDA_ARCH_UTILS_CUH_
 #define CUDA_ARCH_UTILS_CUH_
 
-#include "config.h"
-
 #include "gromacs/utility/basedefinitions.h"
 
 /*! \file
@@ -66,78 +64,6 @@ static const int warp_size_log2 = 5;
  */
 static const unsigned int c_fullWarpMask = 0xffffffff;
 
-/* Below are backward-compatibility wrappers for CUDA 9 warp-wide intrinsics. */
-
-/*! \brief Compatibility wrapper around the CUDA __syncwarp() instrinsic.  */
-static __forceinline__ __device__
-void gmx_syncwarp(const unsigned int activeMask = c_fullWarpMask)
-{
-#if GMX_CUDA_VERSION < 9000
-    /* no sync needed on pre-Volta. */
-    GMX_UNUSED_VALUE(activeMask);
-#else
-    __syncwarp(activeMask);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __ballot()/__ballot_sync() instrinsic.  */
-static __forceinline__ __device__
-unsigned int gmx_ballot_sync(const unsigned int activeMask,
-                             const int          pred)
-{
-#if GMX_CUDA_VERSION < 9000
-    GMX_UNUSED_VALUE(activeMask);
-    return __ballot(pred);
-#else
-    return __ballot_sync(activeMask, pred);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __any()/__any_sync() instrinsic.  */
-static __forceinline__ __device__
-int gmx_any_sync(const unsigned int activeMask,
-                 const int          pred)
-{
-#if GMX_CUDA_VERSION < 9000
-    GMX_UNUSED_VALUE(activeMask);
-    return __any(pred);
-#else
-    return __any_sync(activeMask, pred);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __shfl_up()/__shfl_up_sync() instrinsic.  */
-template <typename T>
-static __forceinline__ __device__
-T gmx_shfl_up_sync(const unsigned int activeMask,
-                   const T            var,
-                   unsigned int       offset,
-                   int                width = warp_size)
-{
-#if GMX_CUDA_VERSION < 9000
-    GMX_UNUSED_VALUE(activeMask);
-    return __shfl_up(var, offset, width);
-#else
-    return __shfl_up_sync(activeMask, var, offset, width);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __shfl_down()/__shfl_down_sync() instrinsic.  */
-template <typename T>
-static __forceinline__ __device__
-T gmx_shfl_down_sync(const unsigned int activeMask,
-                     const T            var,
-                     unsigned int       offset,
-                     int                width = warp_size)
-{
-#if GMX_CUDA_VERSION < 9000
-    GMX_UNUSED_VALUE(activeMask);
-    return __shfl_down(var, offset, width);
-#else
-    return __shfl_down_sync(activeMask, var, offset, width);
-#endif
-}
-
 /*! \brief Allow disabling CUDA textures using the GMX_DISABLE_CUDA_TEXTURES macro.
  *
  *  Only texture objects supported.
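
With the wrappers gone, the rewrite at each call site is mechanical. A
hypothetical before/after of one call site (illustrative; pairSum is not a
GROMACS function):

    __device__ float pairSum(float x, unsigned int activeMask)
    {
        // Before this change: x += gmx_shfl_down_sync(activeMask, x, 1);
        // which on CUDA < 9.0 fell back to the unsynchronized __shfl_down.
        // The wrapper's default width of warp_size matches the intrinsic's
        // default of warpSize, so no call site needs an extra argument.
        return x + __shfl_down_sync(activeMask, x, 1);
    }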
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
index b825e2221b5c241ac3464de09cd9fd6babe42a73..7110a8a40550c6d16446bda6759de90421ce996e 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
@@ -390,7 +390,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
             {
                 cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
             }
-            gmx_syncwarp(c_fullWarpMask);
+            __syncwarp(c_fullWarpMask);
 
             /* Unrolling this loop
                - with pruning leads to register spilling;
@@ -438,7 +438,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                             /* If _none_ of the atom pairs are in cutoff range,
                                the bit corresponding to the current
                                cluster-pair in imask gets set to 0. */
-                            if (!gmx_any_sync(c_fullWarpMask, r2 < rlist_sq))
+                            if (!__any_sync(c_fullWarpMask, r2 < rlist_sq))
                             {
                                 imask &= ~mask_ji;
                             }
@@ -609,7 +609,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #endif
         }
         // avoid shared memory WAR hazards between loop iterations
-        gmx_syncwarp(c_fullWarpMask);
+        __syncwarp(c_fullWarpMask);
     }
 
     /* skip central shifts when summing shift forces */
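
The pruning above rests on a warp-wide vote: a cluster pair survives only
if at least one lane's atom pair is within the cutoff. A minimal sketch of
the pattern (illustrative names, not the GROMACS kernel):

    __device__ unsigned int pruneClusterBit(float r2, float cutoffSq,
                                            unsigned int imask,
                                            unsigned int maskJi)
    {
        const unsigned int activeMask = 0xffffffff;
        // __any_sync returns nonzero on every lane if any lane's
        // predicate is true.
        if (!__any_sync(activeMask, r2 < cutoffSq))
        {
            imask &= ~maskJi; // no lane in range: prune this cluster pair
        }
        return imask;
    }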
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
index 1993a4c181ee8e4e63334ee2c5e94683f3e17091..994951babbe18919132b7c594be440bce31676a9 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
@@ -217,7 +217,7 @@ nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const cu_nbparam_t,
             {
                 cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
             }
-            gmx_syncwarp(c_fullWarpMask);
+            __syncwarp(c_fullWarpMask);
 
 #pragma unroll 4
             for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
@@ -249,13 +249,13 @@ nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const cu_nbparam_t,
                             /* If _none_ of the atom pairs are in rlistOuter
                                range, the bit corresponding to the current
                                cluster-pair in imask gets set to 0. */
-                            if (haveFreshList && !gmx_any_sync(c_fullWarpMask, r2 < rlistOuter_sq))
+                            if (haveFreshList && !__any_sync(c_fullWarpMask, r2 < rlistOuter_sq))
                             {
                                 imaskFull &= ~mask_ji;
                             }
                             /* If any atom pair is within range, set the bit
                                corresponding to the current cluster-pair. */
-                            if (gmx_any_sync(c_fullWarpMask, r2 < rlistInner_sq))
+                            if (__any_sync(c_fullWarpMask, r2 < rlistInner_sq))
                             {
                                 imaskNew |= mask_ji;
                             }
@@ -276,7 +276,7 @@ nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const cu_nbparam_t,
             plist.cj4[j4].imei[widx].imask = imaskNew;
         }
         // avoid shared memory WAR hazards between loop iterations
-        gmx_syncwarp(c_fullWarpMask);
+        __syncwarp(c_fullWarpMask);
     }
 }
 #endif /* FUNCTION_DECLARATION_ONLY */
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
index 758e5ec55d16c6ea3799dda3c8844016c03e6e5f..e152de7fe9df136667d6bbcb7d1746ad59516b05 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
@@ -514,24 +514,24 @@ void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                               int tidxi, int aidx,
                               const unsigned int activemask)
 {
-    f.x += gmx_shfl_down_sync(activemask, f.x, 1);
-    f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
-    f.z += gmx_shfl_down_sync(activemask, f.z, 1);
+    f.x += __shfl_down_sync(activemask, f.x, 1);
+    f.y += __shfl_up_sync  (activemask, f.y, 1);
+    f.z += __shfl_down_sync(activemask, f.z, 1);
 
     if (tidxi & 1)
     {
         f.x = f.y;
     }
 
-    f.x += gmx_shfl_down_sync(activemask, f.x, 2);
-    f.z += gmx_shfl_up_sync  (activemask, f.z, 2);
+    f.x += __shfl_down_sync(activemask, f.x, 2);
+    f.z += __shfl_up_sync  (activemask, f.z, 2);
 
     if (tidxi & 2)
     {
         f.x = f.z;
     }
 
-    f.x += gmx_shfl_down_sync(activemask, f.x, 4);
+    f.x += __shfl_down_sync(activemask, f.x, 4);
 
     if (tidxi < 3)
     {
@@ -640,17 +640,17 @@ void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                               int tidxj, int aidx,
                               const unsigned int activemask)
 {
-    fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
-    fin.y += gmx_shfl_up_sync  (activemask, fin.y, c_clSize);
-    fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
+    fin.x += __shfl_down_sync(activemask, fin.x, c_clSize);
+    fin.y += __shfl_up_sync  (activemask, fin.y, c_clSize);
+    fin.z += __shfl_down_sync(activemask, fin.z, c_clSize);
 
     if (tidxj & 1)
     {
         fin.x = fin.y;
     }
 
-    fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
-    fin.z += gmx_shfl_up_sync  (activemask, fin.z, 2*c_clSize);
+    fin.x += __shfl_down_sync(activemask, fin.x, 2*c_clSize);
+    fin.z += __shfl_up_sync  (activemask, fin.z, 2*c_clSize);
 
     if (tidxj & 2)
     {
@@ -721,8 +721,8 @@ void reduce_energy_warp_shfl(float E_lj, float E_el,
 #pragma unroll 5
     for (i = 0; i < 5; i++)
     {
-        E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
-        E_el += gmx_shfl_down_sync(activemask, E_el, sh);
+        E_lj += __shfl_down_sync(activemask, E_lj, sh);
+        E_el += __shfl_down_sync(activemask, E_el, sh);
         sh   += sh;
     }
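
The loop above reduces both energy terms with a doubling stride. Filling in
the parts that fall outside the hunk by assumption (sh presumably starts at
1), the whole reduction reads:

    // Five doubling steps (1, 2, 4, 8, 16) fold a 32-lane warp's partial
    // energies down to lane 0. Sketch only; the initial sh is an assumption.
    __device__ void warpEnergySum(float &E_lj, float &E_el,
                                  unsigned int activemask)
    {
        int sh = 1;
    #pragma unroll 5
        for (int i = 0; i < 5; i++)
        {
            E_lj += __shfl_down_sync(activemask, E_lj, sh);
            E_el += __shfl_down_sync(activemask, E_el, sh);
            sh   += sh; // double the stride each step
        }
        // lane 0 now holds the warp-wide sums of E_lj and E_el
    }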