Support Simd4N for SimdRealWidth<4

author Roland Schulz <roland.schulz@intel.com>

Mon, 11 Dec 2017 11:07:51 +0000 (03:07 -0800)

committer Mark Abraham <mark.j.abraham@gmail.com>

Sun, 17 Dec 2017 07:38:29 +0000 (08:38 +0100)
author Roland Schulz <roland.schulz@intel.com>
Mon, 11 Dec 2017 11:07:51 +0000 (03:07 -0800)
committer Mark Abraham <mark.j.abraham@gmail.com>
Sun, 17 Dec 2017 07:38:29 +0000 (08:38 +0100)
diff --git a/docs/doxygen/lib/simd.md b/docs/doxygen/lib/simd.md

index 66f717d73216b0ed5488dfa6e1d3878e668c0d62..e597d1479cedd7c732c087452cc9bbdce23d3e94 100644 (file)
--- a/docs/doxygen/lib/simd.md
+++ b/docs/doxygen/lib/simd.md
@@ -374,7 +374,26 @@ Unfortunately reality is not that simple. Some algorithms like lattice
  summation need quartets of elements, so even when the SIMD width is >4 we
  need width-4 SIMD if it is supported. The availability of SIMD4 is indicated
  by \ref GMX_SIMD4_HAVE_FLOAT and \ref GMX_SIMD4_HAVE_DOUBLE. For now we only
-support a small subset of SIMD operations for SIMD4.
+support a small subset of SIMD operations for SIMD4. Because SIMD4 doesn't
+scale with increasingly large SIMD width it should be avoided for all new
+code and SIMD4N should be used instead.
+
+SIMD4N implementation
+---------------------
+
+Some code, like lattice summation, has inner loops which are smaller
+than the full SIMD width. In GROMACS algorithms 3 and 4 iterations are common
+because of PME order and three dimensions. This makes 4 an important special
+case. Vectorizing such loops efficiently requires to collapse the two
+most inner loops and using e.g. one 8-wide SIMD vector for 2 outer
+and 4 inner iterations or one 16-wide SIMD vector for 4 outer and 4 inner
+iterations. For this SIMD4N functions are
+provided. The availability of these function is indicated by
+\ref GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT and
+\ref GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE.
+These functions return the type alias Simd4NFloat / Simd4NDouble which is
+either the normal SIMD type or the SIMD4 type and thus only supports
+the operations the SIMD4 type supports.
  
  Predefined SIMD preprocessor macros
  ===================================
diff --git a/src/gromacs/ewald/pme-gather.cpp b/src/gromacs/ewald/pme-gather.cpp

index d94c6fab84c54b130300170960e4815e840ceaaa..2c9e961feef8c5bf448aea7e351098463265cf21 100644 (file)
--- a/src/gromacs/ewald/pme-gather.cpp
+++ b/src/gromacs/ewald/pme-gather.cpp
@@ -148,32 +148,32 @@ struct do_fspline
          const real *const gmx_restrict dthy = spline->dtheta[YY] + norder;
          const real *const gmx_restrict dthz = spline->dtheta[ZZ] + norder;
  
-        SimdReal                       fx_S = setZero();
-        SimdReal                       fy_S = setZero();
-        SimdReal                       fz_S = setZero();
+        Simd4NReal                     fx_S = setZero();
+        Simd4NReal                     fy_S = setZero();
+        Simd4NReal                     fz_S = setZero();
  
          /* With order 4 the z-spline is actually aligned */
-        const SimdReal tz_S = load4DuplicateN(thz);
-        const SimdReal dz_S = load4DuplicateN(dthz);
+        const Simd4NReal tz_S = load4DuplicateN(thz);
+        const Simd4NReal dz_S = load4DuplicateN(dthz);
  
          for (int ithx = 0; ithx < 4; ithx++)
          {
-            const int      index_x = (idxX + ithx)*gridNY*gridNZ;
-            const SimdReal tx_S    = SimdReal(thx[ithx]);
-            const SimdReal dx_S    = SimdReal(dthx[ithx]);
+            const int        index_x = (idxX + ithx)*gridNY*gridNZ;
+            const Simd4NReal tx_S    = Simd4NReal(thx[ithx]);
+            const Simd4NReal dx_S    = Simd4NReal(dthx[ithx]);
  
-            for (int ithy = 0; ithy < 4; ithy += GMX_SIMD_REAL_WIDTH/4)
+            for (int ithy = 0; ithy < 4; ithy += GMX_SIMD4N_REAL_WIDTH/4)
              {
-                const int      index_xy = index_x + (idxY+ithy)*gridNZ;
+                const int        index_xy = index_x + (idxY+ithy)*gridNZ;
  
-                const SimdReal ty_S = loadUNDuplicate4(thy +ithy);
-                const SimdReal dy_S = loadUNDuplicate4(dthy+ithy);
+                const Simd4NReal ty_S = loadUNDuplicate4(thy +ithy);
+                const Simd4NReal dy_S = loadUNDuplicate4(dthy+ithy);
  
-                const SimdReal gval_S = loadU4NOffset(grid+index_xy+idxZ, gridNZ);
+                const Simd4NReal gval_S = loadU4NOffset(grid+index_xy+idxZ, gridNZ);
  
  
-                const SimdReal fxy1_S = tz_S * gval_S;
-                const SimdReal fz1_S  = dz_S * gval_S;
+                const Simd4NReal fxy1_S = tz_S * gval_S;
+                const Simd4NReal fz1_S  = dz_S * gval_S;
  
                  fx_S = fma(dx_S * ty_S, fxy1_S, fx_S);
                  fy_S = fma(tx_S * dy_S, fxy1_S, fy_S);
diff --git a/src/gromacs/simd/simd.h b/src/gromacs/simd/simd.h

index 45d4baf6daf7687f62b3bb1474ed579312ad920a..673cfa336950c63430e671962a9f9b51b987339a 100644 (file)
--- a/src/gromacs/simd/simd.h
+++ b/src/gromacs/simd/simd.h
@@ -402,7 +402,17 @@ typedef Simd4FBool                Simd4Bool;
   * \{
   */
  
-/*! \libinternal \brief Simd traits */
+namespace internal
+{
+/*! \libinternal \brief Simd traits
+ *
+ * These traits are used to query data about SIMD types. Currently provided
+ * data is useful for SIMD loads (load function and helper classes for
+ * ArrayRef<> in simd_memory.h). Provided data:
+ *  - type: scalar type corresponding to the SIMD type
+ *  - width: SIMD width
+ *  - tag: tag used for type dispatch of load function
+ */
  template<typename T>
  struct SimdTraits {};
  
@@ -450,6 +460,7 @@ struct SimdTraits<const T>
      static constexpr int width = SimdTraits<T>::width;
      using tag = typename SimdTraits<T>::tag;
  };
+}   //namespace internal
  
  /*! \brief Load function that returns SIMD or scalar
   *
@@ -459,9 +470,9 @@ struct SimdTraits<const T>
   */
  template<typename T>
  static inline T
-load(const typename SimdTraits<T>::type *m) //disabled by SFINAE for non-SIMD types
+load(const typename internal::SimdTraits<T>::type *m) //disabled by SFINAE for non-SIMD types
  {
-    return simdLoad(m, typename SimdTraits<T>::tag());
+    return simdLoad(m, typename internal::SimdTraits<T>::tag());
  }
  
  template<typename T>
@@ -478,9 +489,9 @@ load(const typename std::enable_if<std::is_arithmetic<T>::value, T>::type *m)
  
  template <typename T, size_t N>
  static inline T gmx_simdcall
-load(const AlignedArray<typename SimdTraits<T>::type, N> &m)
+load(const AlignedArray<typename internal::SimdTraits<T>::type, N> &m)
  {
-    return simdLoad(m.data(), typename SimdTraits<T>::tag());
+    return simdLoad(m.data(), typename internal::SimdTraits<T>::tag());
  }
  
  /*! \brief Load function that returns SIMD or scalar based on template argument
@@ -491,9 +502,9 @@ load(const AlignedArray<typename SimdTraits<T>::type, N> &m)
   */
  template<typename T>
  static inline T
-loadU(const typename SimdTraits<T>::type *m)
+loadU(const typename internal::SimdTraits<T>::type *m)
  {
-    return simdLoadU(m, typename SimdTraits<T>::tag());
+    return simdLoadU(m, typename internal::SimdTraits<T>::tag());
  }
  
  template<typename T>
@@ -505,9 +516,9 @@ loadU(const typename std::enable_if<std::is_arithmetic<T>::value, T>::type *m)
  
  template <typename T, size_t N>
  static inline T gmx_simdcall
-loadU(const AlignedArray<typename SimdTraits<T>::type, N> &m)
+loadU(const AlignedArray<typename internal::SimdTraits<T>::type, N> &m)
  {
-    return simdLoadU(m.data(), typename SimdTraits<T>::tag());
+    return simdLoadU(m.data(), typename internal::SimdTraits<T>::tag());
  }
  
  class SimdSetZeroProxyInternal;
@@ -572,89 +583,145 @@ setZero()
      return {};
  }
  
+namespace internal
+{
+//TODO: Don't foward function but properly rename them and use proper traits
+template<typename T>
+struct Simd4Traits {};
+
+#if GMX_SIMD4_HAVE_FLOAT
+template<>
+struct Simd4Traits<Simd4Float>
+{
+    using type = float;
+};
+#endif
+
+#if GMX_SIMD4_HAVE_DOUBLE
+template<>
+struct Simd4Traits<Simd4Double>
+{
+    using type = double;
+};
+#endif
+}   //namespace internal
+
+#if GMX_SIMD4_HAVE_REAL
+template<typename T>
+T load(const typename internal::Simd4Traits<T>::type* m)
+{
+    return load4(m);
+}
+template<typename T>
+T loadU(const typename internal::Simd4Traits<T>::type* m)
+{
+    return load4U(m);
+}
+#endif
+
  /* Implement most of 4xn functions by forwarding them to other functions when possible.
   * The functions forwarded here don't need to be implemented by each implementation.
   * For width=4 all functions are forwarded and for width=8 all but loadU4NOffset are forwarded.
   */
  #if GMX_SIMD_HAVE_FLOAT
-#if GMX_SIMD_FLOAT_WIDTH < 4 || !GMX_SIMD_HAVE_LOADU
-#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 0
-#elif GMX_SIMD_FLOAT_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
-#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 1
+#if GMX_SIMD_FLOAT_WIDTH < 4
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT (GMX_SIMD_HAVE_LOADU && GMX_SIMD4_HAVE_FLOAT)
+#elif GMX_SIMD_FLOAT_WIDTH == 4
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT GMX_SIMD_HAVE_LOADU
  //For GMX_SIMD_FLOAT_WIDTH>4 it is the reponsibility of the implementation to set
  //GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
  #endif
  
-#if GMX_SIMD_FLOAT_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
-static inline SimdFloat gmx_simdcall
+#if GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
+#if GMX_SIMD_FLOAT_WIDTH < 4
+using Simd4NFloat = Simd4Float;
+#define GMX_SIMD4N_FLOAT_WIDTH 4
+#else
+using Simd4NFloat = SimdFloat;
+#define GMX_SIMD4N_FLOAT_WIDTH GMX_SIMD_FLOAT_WIDTH
+#endif
+
+#if GMX_SIMD_FLOAT_WIDTH <= 4
+static inline Simd4NFloat gmx_simdcall
  loadUNDuplicate4(const float* f)
  {
-    return SimdFloat(*f);
+    return Simd4NFloat(*f);
  }
-static inline SimdFloat gmx_simdcall
+static inline Simd4NFloat gmx_simdcall
  load4DuplicateN(const float* f)
  {
-    return load<SimdFloat>(f);
+    return load<Simd4NFloat>(f);
  }
-static inline SimdFloat gmx_simdcall
+static inline Simd4NFloat gmx_simdcall
  loadU4NOffset(const float* f, int)
  {
-    return loadU<SimdFloat>(f);
+    return loadU<Simd4NFloat>(f);
  }
-#elif GMX_SIMD_FLOAT_WIDTH == 8 && GMX_SIMD_HAVE_HSIMD_UTIL_FLOAT && GMX_SIMD_HAVE_LOADU
-static inline SimdFloat gmx_simdcall
+#elif GMX_SIMD_FLOAT_WIDTH == 8
+static inline Simd4NFloat gmx_simdcall
  loadUNDuplicate4(const float* f)
  {
      return loadU1DualHsimd(f);
  }
-static inline SimdFloat gmx_simdcall
+static inline Simd4NFloat gmx_simdcall
  load4DuplicateN(const float* f)
  {
      return loadDuplicateHsimd(f);
  }
  #endif
-#else //GMX_SIMD_HAVE_FLOAT
+#endif //GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
+#else  //GMX_SIMD_HAVE_FLOAT
  #define GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT 0
  #endif
  
  #if GMX_SIMD_HAVE_DOUBLE
-#if GMX_SIMD_DOUBLE_WIDTH < 4 || !GMX_SIMD_HAVE_LOADU
-#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 0
-#elif GMX_SIMD_DOUBLE_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
-#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 1
+#if GMX_SIMD_DOUBLE_WIDTH < 4
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE (GMX_SIMD_HAVE_LOADU && GMX_SIMD4_HAVE_DOUBLE)
+#elif GMX_SIMD_DOUBLE_WIDTH == 4
+#define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE GMX_SIMD_HAVE_LOADU
  //For GMX_SIMD_DOUBLE_WIDTH>4 it is the reponsibility of the implementation to set
  //GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
  #endif
  
-#if GMX_SIMD_DOUBLE_WIDTH == 4 && GMX_SIMD_HAVE_LOADU
-static inline SimdDouble gmx_simdcall
+#if GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
+#if GMX_SIMD_DOUBLE_WIDTH < 4
+using Simd4NDouble = Simd4Double;
+#define GMX_SIMD4N_DOUBLE_WIDTH 4
+#else
+using Simd4NDouble = SimdDouble;
+#define GMX_SIMD4N_DOUBLE_WIDTH GMX_SIMD_DOUBLE_WIDTH
+#endif
+
+#if GMX_SIMD_DOUBLE_WIDTH <= 4
+static inline Simd4NDouble gmx_simdcall
  loadUNDuplicate4(const double* f)
  {
-    return SimdDouble(*f);
+    return Simd4NDouble(*f);
  }
-static inline SimdDouble gmx_simdcall
+static inline Simd4NDouble gmx_simdcall
  load4DuplicateN(const double* f)
  {
-    return load<SimdDouble>(f);
+    return load<Simd4NDouble>(f);
  }
-static inline SimdDouble gmx_simdcall
+static inline Simd4NDouble gmx_simdcall
  loadU4NOffset(const double* f, int)
  {
-    return loadU<SimdDouble>(f);
+    return loadU<Simd4NDouble>(f);
  }
-#elif GMX_SIMD_DOUBLE_WIDTH == 8 && GMX_SIMD_HAVE_HSIMD_UTIL_DOUBLE && GMX_SIMD_HAVE_LOADU
-static inline SimdDouble gmx_simdcall
+#elif GMX_SIMD_DOUBLE_WIDTH == 8
+static inline Simd4NDouble gmx_simdcall
  loadUNDuplicate4(const double* f)
  {
      return loadU1DualHsimd(f);
  }
-static inline SimdDouble gmx_simdcall
+static inline Simd4NDouble gmx_simdcall
  load4DuplicateN(const double* f)
  {
      return loadDuplicateHsimd(f);
  }
  #endif
-#else //GMX_SIMD_HAVE_DOUBLE
+#endif //GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE
+#else  //GMX_SIMD_HAVE_DOUBLE
  #define GMX_SIMD_HAVE_4NSIMD_UTIL_DOUBLE 0
  #endif
  
@@ -664,6 +731,16 @@ load4DuplicateN(const double* f)
  #define GMX_SIMD_HAVE_4NSIMD_UTIL_REAL GMX_SIMD_HAVE_4NSIMD_UTIL_FLOAT
  #endif
  
+#if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL
+#if GMX_DOUBLE
+using Simd4NReal = Simd4NDouble;
+#define GMX_SIMD4N_REAL_WIDTH GMX_SIMD4N_DOUBLE_WIDTH
+#else
+using Simd4NReal = Simd4NFloat;
+#define GMX_SIMD4N_REAL_WIDTH GMX_SIMD4N_FLOAT_WIDTH
+#endif
+#endif
+
  //! \}  end of name-group proxy objects
  
  }      // namespace gmx
diff --git a/src/gromacs/simd/tests/simd_floatingpoint_util.cpp b/src/gromacs/simd/tests/simd_floatingpoint_util.cpp

index 78338938a1b6b4caab7b823fb295f626a69da8a2..a43dfc5d7f9fb589181123025e65da4a069a988c 100644 (file)
--- a/src/gromacs/simd/tests/simd_floatingpoint_util.cpp
+++ b/src/gromacs/simd/tests/simd_floatingpoint_util.cpp
@@ -919,11 +919,12 @@ TEST_F(SimdFloatingpointUtilTest, reduceIncr4SumHsimd)
  
  #endif      // GMX_SIMD_HAVE_HSIMD_UTIL_REAL
  
-#if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL
+//Test Currently doesn't work for GMX_SIMD_REAL_WIDTH<4. Should be fixed by having GMX_EXPECT_SIMD_REAL_EQ which works for both Simd and Simd4
+#if GMX_SIMD_HAVE_4NSIMD_UTIL_REAL && GMX_SIMD_REAL_WIDTH >= 4
  
  TEST_F(SimdFloatingpointUtilTest, loadUNDuplicate4)
  {
-    SimdReal        v0, v1;
+    Simd4NReal      v0, v1;
      int             i;
      real            data[GMX_SIMD_REAL_WIDTH/4];
      std::iota(data, data+GMX_SIMD_REAL_WIDTH/4, 1);
@@ -936,7 +937,7 @@ TEST_F(SimdFloatingpointUtilTest, loadUNDuplicate4)
          val0_[i*4] = val0_[i*4+1] = val0_[i*4+2] = val0_[i*4+3] = data[i];
      }
  
-    v0 = load<SimdReal>(val0_);
+    v0 = load<Simd4NReal>(val0_);
      v1 = loadUNDuplicate4(data);
  
      GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
@@ -944,9 +945,9 @@ TEST_F(SimdFloatingpointUtilTest, loadUNDuplicate4)
  
  TEST_F(SimdFloatingpointUtilTest, load4DuplicateN)
  {
-    SimdReal        v0, v1;
-    int             i;
-    real            data[4] = { 1, 2, 3, 4};
+    Simd4NReal        v0, v1;
+    int               i;
+    real              data[4] = { 1, 2, 3, 4};
  
      for (i = 0; i < GMX_SIMD_REAL_WIDTH / 4; i++)
      {
@@ -956,7 +957,7 @@ TEST_F(SimdFloatingpointUtilTest, load4DuplicateN)
          val0_[i*4+3] = data[3];
      }
  
-    v0 = load<SimdReal>(val0_);
+    v0 = load<Simd4NReal>(val0_);
      v1 = load4DuplicateN(val0_);
  
      GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
@@ -977,8 +978,8 @@ TEST_F(SimdFloatingpointUtilTest, loadU4NOffset)
          val0_[i*4+3] = data[3+offset*i];
      }
  
-    const SimdReal v0 = load<SimdReal>(val0_);
-    const SimdReal v1 = loadU4NOffset(data, offset);
+    const Simd4NReal v0 = load<Simd4NReal>(val0_);
+    const Simd4NReal v1 = loadU4NOffset(data, offset);
  
      GMX_EXPECT_SIMD_REAL_EQ(v0, v1);
  }
diff --git a/src/gromacs/simd/tests/simd_memory.cpp b/src/gromacs/simd/tests/simd_memory.cpp

index 55b1893adf2832e0500d7b5a9f9f87924a23f5c3..24aab65dbc8830bc1a4d0f345318c750edbfb724 100644 (file)
--- a/src/gromacs/simd/tests/simd_memory.cpp
+++ b/src/gromacs/simd/tests/simd_memory.cpp
@@ -79,8 +79,8 @@ class ArrayRefTest : public test::SimdTest
          using ArrayRefType = TypeParam;
          using PointerType  = typename ArrayRefType::pointer;
          using ValueType    = typename ArrayRefType::value_type;
-        using ElementType  = typename std::remove_const<typename SimdTraits<ValueType>::type>::type;
-        static constexpr int width = SimdTraits<ValueType>::width;
+        using ElementType  = typename std::remove_const<typename gmx::internal::SimdTraits<ValueType>::type>::type;
+        static constexpr int width = gmx::internal::SimdTraits<ValueType>::width;
  
          /*! \brief Run the same tests all the time
           *
author	Roland Schulz <roland.schulz@intel.com>
	Mon, 11 Dec 2017 11:07:51 +0000 (03:07 -0800)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Sun, 17 Dec 2017 07:38:29 +0000 (08:38 +0100)
docs/doxygen/lib/simd.md		patch \| blob \| history
src/gromacs/ewald/pme-gather.cpp		patch \| blob \| history
src/gromacs/simd/simd.h		patch \| blob \| history
src/gromacs/simd/tests/simd_floatingpoint_util.cpp		patch \| blob \| history
src/gromacs/simd/tests/simd_memory.cpp		patch \| blob \| history