Remove leftover support for pre-9.0 CUDA
author     Mark Abraham <mark.j.abraham@gmail.com>
           Tue, 13 Aug 2019 12:31:57 +0000 (14:31 +0200)
committer  Mark Abraham <mark.j.abraham@gmail.com>
           Fri, 27 Sep 2019 07:02:46 +0000 (09:02 +0200)
GROMACS now requires CUDA 9.0 or newer, so the compatibility wrappers
gmx_syncwarp(), gmx_ballot_sync(), gmx_any_sync(), gmx_shfl_up_sync()
and gmx_shfl_down_sync() in cuda_arch_utils.cuh no longer select
anything but the native intrinsics. Remove the wrappers and call
__syncwarp(), __ballot_sync(), __any_sync(), __shfl_up_sync() and
__shfl_down_sync() directly from the PME and NBNXM CUDA kernels.

Refs #2831

Change-Id: I7ec33bb3582006123e745d06da27c9eed12fbfc2

src/gromacs/ewald/pme_gather.clh
src/gromacs/ewald/pme_gather.cu
src/gromacs/ewald/pme_solve.cu
src/gromacs/ewald/pme_spread.clh
src/gromacs/ewald/pme_spread.cu
src/gromacs/gpu_utils/cuda_arch_utils.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
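
The change is mechanical: every wrapper call becomes a direct call to the
corresponding CUDA 9 warp-synchronous intrinsic. For reference, a minimal
standalone sketch of the reduction idiom these kernels use (illustrative
only, not GROMACS code; kFullWarpMask mirrors c_fullWarpMask from
cuda_arch_utils.cuh):

    // Warp-wide sum with __shfl_down_sync; launch as warpSum<<<1, 32>>>(in, out).
    static const unsigned int kFullWarpMask = 0xffffffff;

    __global__ void warpSum(const float* in, float* out)
    {
        float v = in[threadIdx.x];
        // Fold the upper half of the remaining lanes onto the lower half.
        // All 32 lanes participate, so the full warp mask is correct here.
        for (int delta = 16; delta > 0; delta >>= 1)
        {
            v += __shfl_down_sync(kFullWarpMask, v, delta);
        }
        if (threadIdx.x == 0)
        {
            *out = v; // lane 0 now holds the warp-wide sum
        }
    }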

diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh
index e2ad100e48d07a7b774671fd0a311d37d9d96b32..52ed41d70fb40f4ebdbf078b83d477f1f05aa3fb 100644
--- a/src/gromacs/ewald/pme_gather.clh
+++ b/src/gromacs/ewald/pme_gather.clh
@@ -351,7 +351,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe
     }
 
 #if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
-    /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was gmx_syncwarp() in CUDA.
+    /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was __syncwarp() in CUDA.
      * #2519
      */
     barrier(CLK_LOCAL_MEM_FENCE);
diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu
index 6c29ebcf2156670a5ef779da86e697220a7c079e..be34c3c6cda751526bb88c070d2276f4cbb648bd 100644
--- a/src/gromacs/ewald/pme_gather.cu
+++ b/src/gromacs/ewald/pme_gather.cu
@@ -100,17 +100,17 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
         static_assert(atomDataSize <= warp_size, "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
         const int width = atomDataSize;
 
-        fx += gmx_shfl_down_sync(activeMask, fx, 1, width);
-        fy += gmx_shfl_up_sync  (activeMask, fy, 1, width);
-        fz += gmx_shfl_down_sync(activeMask, fz, 1, width);
+        fx += __shfl_down_sync(activeMask, fx, 1, width);
+        fy += __shfl_up_sync  (activeMask, fy, 1, width);
+        fz += __shfl_down_sync(activeMask, fz, 1, width);
 
         if (splineIndex & 1)
         {
             fx = fy;
         }
 
-        fx += gmx_shfl_down_sync(activeMask, fx, 2, width);
-        fz += gmx_shfl_up_sync  (activeMask, fz, 2, width);
+        fx += __shfl_down_sync(activeMask, fx, 2, width);
+        fz += __shfl_up_sync  (activeMask, fz, 2, width);
 
         if (splineIndex & 2)
         {
@@ -124,7 +124,7 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
         // We have to just further reduce those groups of 4
         for (int delta = 4; delta < atomDataSize; delta <<= 1)
         {
-            fx += gmx_shfl_down_sync(activeMask, fx, delta, width);
+            fx += __shfl_down_sync(activeMask, fx, delta, width);
         }
 
         const int dimIndex = splineIndex;
@@ -194,7 +194,7 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
                 }
             }
 
-            gmx_syncwarp();
+            __syncwarp();
 
             const float n         = read_grid_size(realGridSizeFP, dimIndex);
             const int   atomIndex = sourceIndex / minStride;
@@ -378,7 +378,7 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
         sm_forces[forceIndexLocal] = result;
     }
 
-    gmx_syncwarp();
+    __syncwarp();
     assert(atomsPerBlock <= warp_size);
 
     /* Writing or adding the final forces component-wise, single warp */
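
The interleaved up/down shuffles above let the x, y and z force components
share one reduction tree instead of needing three. A minimal sketch of the
pattern for a single 4-lane group (illustrative only, not the GROMACS
kernel; lane is the lane index within the warp):

    // After two rounds, lane 0 of each 4-lane group holds sum(fx),
    // lane 1 holds sum(fy) and lane 2 holds sum(fz); lane 3 is unused.
    __device__ float reduceXYZ(float fx, float fy, float fz, int lane)
    {
        const unsigned int mask  = 0xffffffff; // all 32 lanes participate
        const int          width = 4;          // independent 4-lane subgroups

        // Round 1: pair sums; fy travels up so the odd lanes can adopt it.
        fx += __shfl_down_sync(mask, fx, 1, width);
        fy += __shfl_up_sync  (mask, fy, 1, width);
        fz += __shfl_down_sync(mask, fz, 1, width);
        if (lane & 1)
        {
            fx = fy; // odd lanes now carry the y pair sums
        }

        // Round 2: quad sums; fz travels up so lanes 2-3 can adopt it.
        fx += __shfl_down_sync(mask, fx, 2, width);
        fz += __shfl_up_sync  (mask, fz, 2, width);
        if (lane & 2)
        {
            fx = fz; // lanes 2-3 now carry the z sum
        }
        return fx;
    }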
diff --git a/src/gromacs/ewald/pme_solve.cu b/src/gromacs/ewald/pme_solve.cu
index a48680283dfb4b6b4346dd6b0553f59530c1c9e3..401d82f21844645d7ca0aca934fbe5ca173a9b46 100644
--- a/src/gromacs/ewald/pme_solve.cu
+++ b/src/gromacs/ewald/pme_solve.cu
@@ -252,13 +252,13 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
         const unsigned int activeMask = c_fullWarpMask;
 
         /* Making pair sums */
-        virxx  += gmx_shfl_down_sync(activeMask, virxx, 1, width);
-        viryy  += gmx_shfl_up_sync  (activeMask, viryy, 1, width);
-        virzz  += gmx_shfl_down_sync(activeMask, virzz, 1, width);
-        virxy  += gmx_shfl_up_sync  (activeMask, virxy, 1, width);
-        virxz  += gmx_shfl_down_sync(activeMask, virxz, 1, width);
-        viryz  += gmx_shfl_up_sync  (activeMask, viryz, 1, width);
-        energy += gmx_shfl_down_sync(activeMask, energy, 1, width);
+        virxx  += __shfl_down_sync(activeMask, virxx, 1, width);
+        viryy  += __shfl_up_sync  (activeMask, viryy, 1, width);
+        virzz  += __shfl_down_sync(activeMask, virzz, 1, width);
+        virxy  += __shfl_up_sync  (activeMask, virxy, 1, width);
+        virxz  += __shfl_down_sync(activeMask, virxz, 1, width);
+        viryz  += __shfl_up_sync  (activeMask, viryz, 1, width);
+        energy += __shfl_down_sync(activeMask, energy, 1, width);
         if (threadLocalId & 1)
         {
             virxx = viryy; // virxx now holds virxx and viryy pair sums
@@ -267,10 +267,10 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
         }
 
         /* Making quad sums */
-        virxx  += gmx_shfl_down_sync(activeMask, virxx, 2, width);
-        virzz  += gmx_shfl_up_sync  (activeMask, virzz, 2, width);
-        virxz  += gmx_shfl_down_sync(activeMask, virxz, 2, width);
-        energy += gmx_shfl_up_sync  (activeMask, energy, 2, width);
+        virxx  += __shfl_down_sync(activeMask, virxx, 2, width);
+        virzz  += __shfl_up_sync  (activeMask, virzz, 2, width);
+        virxz  += __shfl_down_sync(activeMask, virxz, 2, width);
+        energy += __shfl_up_sync  (activeMask, energy, 2, width);
         if (threadLocalId & 2)
         {
             virxx = virzz;  // virxx now holds quad sums of virxx, viryy, virzz and virxy
@@ -278,8 +278,8 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
         }
 
         /* Making octet sums */
-        virxx += gmx_shfl_down_sync(activeMask, virxx, 4, width);
-        virxz += gmx_shfl_up_sync  (activeMask, virxz, 4, width);
+        virxx += __shfl_down_sync(activeMask, virxx, 4, width);
+        virxz += __shfl_up_sync  (activeMask, virxz, 4, width);
         if (threadLocalId & 4)
         {
             virxx = virxz; // virxx now holds all 7 components' octet sums + unused paddings
@@ -289,7 +289,7 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
 #pragma unroll
         for (int delta = 8; delta < width; delta <<= 1)
         {
-            virxx += gmx_shfl_down_sync(activeMask, virxx, delta, width);
+            virxx += __shfl_down_sync(activeMask, virxx, delta, width);
         }
         /* Now first 7 threads of each warp have the full output contributions in virxx */
 
@@ -334,7 +334,7 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam
 #pragma unroll
             for (int delta = stride; delta < warp_size; delta <<= 1)
             {
-                output += gmx_shfl_down_sync(activeMask, output, delta, warp_size);
+                output += __shfl_down_sync(activeMask, output, delta, warp_size);
             }
             /* Final output */
             if (validComponentIndex)
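
Worked out, the pair/quad/octet folding above packs all seven per-warp
outputs into virxx (a mapping the code implies but never states; derived
by tracing the shuffles):

    lane 0: virxx   lane 1: viryy   lane 2: virzz   lane 3: virxy
    lane 4: virxz   lane 5: viryz   lane 6: energy  lane 7: unused

The closing delta = 8, 16, ... loop then folds these 8-lane groups across
the warp, which is why the first 7 threads of each warp end up holding the
full output contributions.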
diff --git a/src/gromacs/ewald/pme_spread.clh b/src/gromacs/ewald/pme_spread.clh
index 2b140133c11adb06eec94e9e2542927b496160b2..e64d22382270bdb4af02cb41d09e9757031d3a0d 100644
--- a/src/gromacs/ewald/pme_spread.clh
+++ b/src/gromacs/ewald/pme_spread.clh
@@ -428,7 +428,7 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_spline_and_spread_kernel)(const struct
                           sm_fractCoords, gm_theta, gm_dtheta, gm_gridlineIndices,
                           gm_fractShiftsTable, gm_gridlineIndicesTable);
 #if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
-        /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was gmx_syncwarp() in CUDA.
+        /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was __syncwarp() in CUDA.
          * #2519
          */
         barrier(CLK_LOCAL_MEM_FENCE);
diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu
index aa143ca451046b12402ee80da06f47d7a0b89624..4037a71f7683a8db0d72143f5e42a85c8d6e07aa 100644
--- a/src/gromacs/ewald/pme_spread.cu
+++ b/src/gromacs/ewald/pme_spread.cu
@@ -449,7 +449,7 @@ __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernel
         __syncthreads();
         calculate_splines<order, atomsPerBlock>(kernelParams, atomIndexOffset, (const float3 *)sm_coordinates,
                                                 sm_coefficients, sm_theta, sm_gridlineIndices);
-        gmx_syncwarp();
+        __syncwarp();
     }
     else
     {
diff --git a/src/gromacs/gpu_utils/cuda_arch_utils.cuh b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
index d722aa1feb83041cd3b1dcef065a4bafbb6f38cc..8f97d1ab932a6a5dd8c5ba3c15f95a43ec043be7 100644
--- a/src/gromacs/gpu_utils/cuda_arch_utils.cuh
+++ b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
@@ -35,8 +35,6 @@
 #ifndef CUDA_ARCH_UTILS_CUH_
 #define CUDA_ARCH_UTILS_CUH_
 
-#include "config.h"
-
 #include "gromacs/utility/basedefinitions.h"
 
 /*! \file
@@ -66,78 +64,6 @@ static const int warp_size_log2 = 5;
  */
 static const unsigned int c_fullWarpMask = 0xffffffff;
 
-/* Below are backward-compatibility wrappers for CUDA 9 warp-wide intrinsics. */
-
-/*! \brief Compatibility wrapper around the CUDA __syncwarp() instrinsic.  */
-static __forceinline__ __device__
-void gmx_syncwarp(const unsigned int activeMask = c_fullWarpMask)
-{
-#if GMX_CUDA_VERSION < 9000
-    /* no sync needed on pre-Volta. */
-    GMX_UNUSED_VALUE(activeMask);
-#else
-    __syncwarp(activeMask);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __ballot()/__ballot_sync() instrinsic.  */
-static __forceinline__ __device__
-unsigned int gmx_ballot_sync(const unsigned int activeMask,
-                             const int          pred)
-{
-#if GMX_CUDA_VERSION < 9000
-    GMX_UNUSED_VALUE(activeMask);
-    return __ballot(pred);
-#else
-    return __ballot_sync(activeMask, pred);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __any()/__any_sync() instrinsic.  */
-static __forceinline__ __device__
-int gmx_any_sync(const unsigned int activeMask,
-                 const int          pred)
-{
-#if GMX_CUDA_VERSION < 9000
-    GMX_UNUSED_VALUE(activeMask);
-    return __any(pred);
-#else
-    return __any_sync(activeMask, pred);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __shfl_up()/__shfl_up_sync() instrinsic.  */
-template <typename T>
-static __forceinline__ __device__
-T gmx_shfl_up_sync(const unsigned int activeMask,
-                   const T            var,
-                   unsigned int       offset,
-                   int                width = warp_size)
-{
-#if GMX_CUDA_VERSION < 9000
-    GMX_UNUSED_VALUE(activeMask);
-    return __shfl_up(var, offset, width);
-#else
-    return __shfl_up_sync(activeMask, var, offset, width);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __shfl_down()/__shfl_down_sync() instrinsic.  */
-template <typename T>
-static __forceinline__ __device__
-T gmx_shfl_down_sync(const unsigned int activeMask,
-                     const T            var,
-                     unsigned int       offset,
-                     int                width = warp_size)
-{
-#if GMX_CUDA_VERSION < 9000
-    GMX_UNUSED_VALUE(activeMask);
-    return __shfl_down(var, offset, width);
-#else
-    return __shfl_down_sync(activeMask, var, offset, width);
-#endif
-}
-
 /*! \brief Allow disabling CUDA textures using the GMX_DISABLE_CUDA_TEXTURES macro.
  *
  *  Only texture objects supported.
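
With the wrappers gone, the rewrite at each call site is mechanical. A
hypothetical before/after of one call site (illustrative; pairSum is not a
GROMACS function):

    __device__ float pairSum(float x, unsigned int activeMask)
    {
        // Before this change: x += gmx_shfl_down_sync(activeMask, x, 1);
        // which on CUDA < 9.0 fell back to the unsynchronized __shfl_down.
        // The wrapper's default width of warp_size matches the intrinsic's
        // default of warpSize, so no call site needs an extra argument.
        return x + __shfl_down_sync(activeMask, x, 1);
    }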
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
index b825e2221b5c241ac3464de09cd9fd6babe42a73..7110a8a40550c6d16446bda6759de90421ce996e 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
@@ -390,7 +390,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
             {
                 cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
             }
-            gmx_syncwarp(c_fullWarpMask);
+            __syncwarp(c_fullWarpMask);
 
             /* Unrolling this loop
                - with pruning leads to register spilling;
@@ -438,7 +438,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                             /* If _none_ of the atom pairs are in cutoff range,
                                the bit corresponding to the current
                                cluster-pair in imask gets set to 0. */
-                            if (!gmx_any_sync(c_fullWarpMask, r2 < rlist_sq))
+                            if (!__any_sync(c_fullWarpMask, r2 < rlist_sq))
                             {
                                 imask &= ~mask_ji;
                             }
@@ -609,7 +609,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #endif
         }
         // avoid shared memory WAR hazards between loop iterations
-        gmx_syncwarp(c_fullWarpMask);
+        __syncwarp(c_fullWarpMask);
     }
 
     /* skip central shifts when summing shift forces */
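
The pruning above rests on a warp-wide vote: a cluster pair survives only
if at least one lane's atom pair is within the cutoff. A minimal sketch of
the pattern (illustrative names, not the GROMACS kernel):

    __device__ unsigned int pruneClusterBit(float r2, float cutoffSq,
                                            unsigned int imask,
                                            unsigned int maskJi)
    {
        const unsigned int activeMask = 0xffffffff;
        // __any_sync returns nonzero on every lane if any lane's
        // predicate is true.
        if (!__any_sync(activeMask, r2 < cutoffSq))
        {
            imask &= ~maskJi; // no lane in range: prune this cluster pair
        }
        return imask;
    }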
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
index 1993a4c181ee8e4e63334ee2c5e94683f3e17091..994951babbe18919132b7c594be440bce31676a9 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
@@ -217,7 +217,7 @@ nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const cu_nbparam_t,
             {
                 cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
             }
-            gmx_syncwarp(c_fullWarpMask);
+            __syncwarp(c_fullWarpMask);
 
 #pragma unroll 4
             for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
@@ -249,13 +249,13 @@ nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const cu_nbparam_t,
                             /* If _none_ of the atom pairs are in rlistOuter
                                range, the bit corresponding to the current
                                cluster-pair in imask gets set to 0. */
-                            if (haveFreshList && !gmx_any_sync(c_fullWarpMask, r2 < rlistOuter_sq))
+                            if (haveFreshList && !__any_sync(c_fullWarpMask, r2 < rlistOuter_sq))
                             {
                                 imaskFull &= ~mask_ji;
                             }
                             /* If any atom pair is within range, set the bit
                                corresponding to the current cluster-pair. */
-                            if (gmx_any_sync(c_fullWarpMask, r2 < rlistInner_sq))
+                            if (__any_sync(c_fullWarpMask, r2 < rlistInner_sq))
                             {
                                 imaskNew |= mask_ji;
                             }
@@ -276,7 +276,7 @@ nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const cu_nbparam_t,
             plist.cj4[j4].imei[widx].imask = imaskNew;
         }
         // avoid shared memory WAR hazards between loop iterations
-        gmx_syncwarp(c_fullWarpMask);
+        __syncwarp(c_fullWarpMask);
     }
 }
 #endif /* FUNCTION_DECLARATION_ONLY */
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
index 758e5ec55d16c6ea3799dda3c8844016c03e6e5f..e152de7fe9df136667d6bbcb7d1746ad59516b05 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
@@ -514,24 +514,24 @@ void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                               int tidxi, int aidx,
                               const unsigned int activemask)
 {
-    f.x += gmx_shfl_down_sync(activemask, f.x, 1);
-    f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
-    f.z += gmx_shfl_down_sync(activemask, f.z, 1);
+    f.x += __shfl_down_sync(activemask, f.x, 1);
+    f.y += __shfl_up_sync  (activemask, f.y, 1);
+    f.z += __shfl_down_sync(activemask, f.z, 1);
 
     if (tidxi & 1)
     {
         f.x = f.y;
     }
 
-    f.x += gmx_shfl_down_sync(activemask, f.x, 2);
-    f.z += gmx_shfl_up_sync  (activemask, f.z, 2);
+    f.x += __shfl_down_sync(activemask, f.x, 2);
+    f.z += __shfl_up_sync  (activemask, f.z, 2);
 
     if (tidxi & 2)
     {
         f.x = f.z;
     }
 
-    f.x += gmx_shfl_down_sync(activemask, f.x, 4);
+    f.x += __shfl_down_sync(activemask, f.x, 4);
 
     if (tidxi < 3)
     {
@@ -640,17 +640,17 @@ void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                               int tidxj, int aidx,
                               const unsigned int activemask)
 {
-    fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
-    fin.y += gmx_shfl_up_sync  (activemask, fin.y, c_clSize);
-    fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
+    fin.x += __shfl_down_sync(activemask, fin.x, c_clSize);
+    fin.y += __shfl_up_sync  (activemask, fin.y, c_clSize);
+    fin.z += __shfl_down_sync(activemask, fin.z, c_clSize);
 
     if (tidxj & 1)
     {
         fin.x = fin.y;
     }
 
-    fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
-    fin.z += gmx_shfl_up_sync  (activemask, fin.z, 2*c_clSize);
+    fin.x += __shfl_down_sync(activemask, fin.x, 2*c_clSize);
+    fin.z += __shfl_up_sync  (activemask, fin.z, 2*c_clSize);
 
     if (tidxj & 2)
     {
@@ -721,8 +721,8 @@ void reduce_energy_warp_shfl(float E_lj, float E_el,
 #pragma unroll 5
     for (i = 0; i < 5; i++)
     {
-        E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
-        E_el += gmx_shfl_down_sync(activemask, E_el, sh);
+        E_lj += __shfl_down_sync(activemask, E_lj, sh);
+        E_el += __shfl_down_sync(activemask, E_el, sh);
         sh   += sh;
     }
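
The loop above reduces both energy terms with a doubling stride. Filling in
the parts that fall outside the hunk by assumption (sh presumably starts at
1), the whole reduction reads:

    // Five doubling steps (1, 2, 4, 8, 16) fold a 32-lane warp's partial
    // energies down to lane 0. Sketch only; the initial sh is an assumption.
    __device__ void warpEnergySum(float &E_lj, float &E_el,
                                  unsigned int activemask)
    {
        int sh = 1;
    #pragma unroll 5
        for (int i = 0; i < 5; i++)
        {
            E_lj += __shfl_down_sync(activemask, E_lj, sh);
            E_el += __shfl_down_sync(activemask, E_el, sh);
            sh   += sh; // double the stride each step
        }
        // lane 0 now holds the warp-wide sums of E_lj and E_el
    }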