Merge branch release-2016

[alexxy/gromacs.git] / src / gromacs / gpu_utils / cuda_arch_utils.cuh
diff --git a/src/gromacs/gpu_utils/cuda_arch_utils.cuh b/src/gromacs/gpu_utils/cuda_arch_utils.cuh

index efd59644a965d10783ec420bd497d2384129d02f..a60bf9ebb80537871ba7f5b52100458ae62c6509 100644 (file)
--- a/src/gromacs/gpu_utils/cuda_arch_utils.cuh
+++ b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
@@ -35,6 +35,8 @@
  #ifndef CUDA_ARCH_UTILS_CUH_
  #define CUDA_ARCH_UTILS_CUH_
  
+#include "config.h"
+
  /*! \file
   *  \brief CUDA arch dependent definitions.
   *
@@ -57,6 +59,80 @@
   */
  static const int warp_size      = 32;
  static const int warp_size_log2 = 5;
+/*! \brief Bitmask corresponding to all threads active in a warp.
+ *  NOTE that here too we assume 32-wide warps.
+ */
+static const unsigned int c_fullWarpMask = 0xffffffff;
+
+/* Below are backward-compatibility wrappers for CUDA 9 warp-wide intrinsics. */
+
+/*! \brief Compatibility wrapper around the CUDA __syncwarp() instrinsic.  */
+static __forceinline__ __device__
+void gmx_syncwarp(const unsigned int activeMask = c_fullWarpMask)
+{
+#if GMX_CUDA_VERSION < 9000
+    /* no sync needed on pre-Volta. */
+    GMX_UNUSED_VALUE(activeMask);
+#else
+    __syncwarp(activeMask);
+#endif
+}
+
+/*! \brief Compatibility wrapper around the CUDA __ballot()/__ballot_sync() instrinsic.  */
+static __forceinline__ __device__
+unsigned int gmx_ballot_sync(const unsigned int activeMask,
+                             const int          pred)
+{
+#if GMX_CUDA_VERSION < 9000
+    GMX_UNUSED_VALUE(activeMask);
+    return __ballot(pred);
+#else
+    return __ballot_sync(activeMask, pred);
+#endif
+}
+
+/*! \brief Compatibility wrapper around the CUDA __any()/__any_sync() instrinsic.  */
+static __forceinline__ __device__
+int gmx_any_sync(const unsigned int activeMask,
+                 const int          pred)
+{
+#if GMX_CUDA_VERSION < 9000
+    GMX_UNUSED_VALUE(activeMask);
+    return __any(pred);
+#else
+    return __any_sync(activeMask, pred);
+#endif
+}
+
+/*! \brief Compatibility wrapper around the CUDA __shfl_up()/__shfl_up_sync() instrinsic.  */
+template <typename T>
+static __forceinline__ __device__
+T gmx_shfl_up_sync(const unsigned int activeMask,
+                   const T            var,
+                   unsigned int       offset)
+{
+#if GMX_CUDA_VERSION < 9000
+    GMX_UNUSED_VALUE(activeMask);
+    return __shfl_up(var, offset);
+#else
+    return __shfl_up_sync(activeMask, var, offset);
+#endif
+}
+
+/*! \brief Compatibility wrapper around the CUDA __shfl_down()/__shfl_down_sync() instrinsic.  */
+template <typename T>
+static __forceinline__ __device__
+T gmx_shfl_down_sync(const unsigned int activeMask,
+                     const T            var,
+                     unsigned int       offset)
+{
+#if GMX_CUDA_VERSION < 9000
+    GMX_UNUSED_VALUE(activeMask);
+    return __shfl_down(var, offset);
+#else
+    return __shfl_down_sync(activeMask, var, offset);
+#endif
+}
  
  /*! \brief Allow disabling CUDA textures using the GMX_DISABLE_CUDA_TEXTURES macro.
   *