implemented plain-C SIMD macros for reference
diff --git a/src/mdlib/pme.c b/src/mdlib/pme.c
index 520aa7368aa98d3c4cbf5455783e1afdb97b61ac..df8b463d5d9e124507ecd0ef075f13d7f3afc3fd 100644
--- a/src/mdlib/pme.c
+++ b/src/mdlib/pme.c
 #include "gmx_cyclecounter.h"
 #include "gmx_omp.h"
 
-/* Single precision, with SSE2 or higher available */
-#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
 
-#include "gmx_x86_simd_single.h"
+/* Include the SIMD macro file and then check for support */
+#include "gmx_simd_macros.h"
+#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP
+/* Turn on SIMD intrinsics for PME solve */
+#define PME_SIMD
+#endif
 
-#define PME_SSE
+/* SIMD spread+gather only in single precision with SSE2 or higher available.
+ * We might want to switch to using gmx_simd_macros.h, but this is somewhat
+ * complicated, as we use unaligned and/or 4-wide-only loads.
+ */
+#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
+#define PME_SSE_SPREAD_GATHER
+#include <emmintrin.h>
 /* Some old AMD processors could have problems with unaligned loads+stores */
 #ifndef GMX_FAHCORE
 #define PME_SSE_UNALIGNED
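
The plain-C reference SIMD macros named in the commit message live in gmx_simd_macros.h, which is not part of this diff. As a sketch only, with the vector width, the vector type and the real typedef assumed here rather than taken from that file, the macros used in the solve path below (gmx_mm_pr, gmx_load_pr, gmx_mul_pr, gmx_exp_pr, gmx_store_pr) could be given plain-C fallbacks along these lines:

/* Illustrative plain-C fallback for a few of the SIMD macros used below.
 * Assumed shapes only; the real definitions live in gmx_simd_macros.h. */
#include <math.h>

typedef float real;                       /* stand-in for GROMACS' real     */
#define GMX_SIMD_WIDTH_HERE 4             /* assumed reference vector width */

typedef struct { real r[GMX_SIMD_WIDTH_HERE]; } gmx_mm_pr;

static gmx_mm_pr gmx_load_pr(const real *a)
{
    gmx_mm_pr x;
    int       i;

    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
    {
        x.r[i] = a[i];
    }
    return x;
}

static gmx_mm_pr gmx_mul_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    gmx_mm_pr x;
    int       i;

    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
    {
        x.r[i] = a.r[i]*b.r[i];
    }
    return x;
}

static gmx_mm_pr gmx_exp_pr(gmx_mm_pr a)
{
    gmx_mm_pr x;
    int       i;

    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
    {
        x.r[i] = exp(a.r[i]);
    }
    return x;
}

static void gmx_store_pr(real *a, gmx_mm_pr x)
{
    int i;

    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
    {
        a[i] = x.r[i];
    }
}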
@@ -231,7 +240,7 @@ typedef struct {
 
 
 typedef struct {
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
     /* Masks for SSE aligned spreading and gathering */
     __m128 mask_SSE0[6], mask_SSE1[6];
 #else
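
When PME_SSE_SPREAD_GATHER is set, the spline-work struct carries two banks of six 4-wide masks. A reasonable reading (pme_sse_single.h is not part of this diff) is that they zero the lanes outside the spline support, so spreading and gathering can always issue full-width loads and stores; a masked accumulate of that kind might look like:

#include <emmintrin.h>

/* Sketch only: add a 4-wide contribution to an aligned grid line, with the
 * lanes outside the spline support zeroed by a precomputed all-ones/all-zeros
 * mask such as mask_SSE0/mask_SSE1 above. */
static void masked_accumulate(float *grid_line, __m128 contrib, __m128 mask)
{
    __m128 g;

    g = _mm_load_ps(grid_line);                   /* aligned 4-wide load       */
    g = _mm_add_ps(g, _mm_and_ps(contrib, mask)); /* keep only the valid lanes */
    _mm_store_ps(grid_line, g);
}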
@@ -1544,7 +1553,7 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
             switch (order)
             {
                 case 4:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
 #ifdef PME_SSE_UNALIGNED
 #define PME_SPREAD_SSE_ORDER4
 #else
@@ -1557,7 +1566,7 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 #endif
                     break;
                 case 5:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
 #define PME_SPREAD_SSE_ALIGNED
 #define PME_ORDER 5
 #include "pme_sse_single.h"
@@ -1575,7 +1584,7 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 
 static void set_grid_alignment(int *pmegrid_nz, int pme_order)
 {
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
     if (pme_order == 5
 #ifndef PME_SSE_UNALIGNED
         || pme_order == 4
@@ -1590,7 +1599,7 @@ static void set_grid_alignment(int *pmegrid_nz, int pme_order)
 
 static void set_gridsize_alignment(int *gridsize, int pme_order)
 {
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
 #ifndef PME_SSE_UNALIGNED
     if (pme_order == 4)
     {
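
These two helpers pad the grid's z dimension and the per-thread grid size so the aligned 4-wide spreading kernels can step over them safely; the rounding itself lies outside the shown hunks, but the arithmetic is simply rounding up to a multiple of the vector width:

/* Round n up to a multiple of width (width > 0), e.g. to a multiple of the
 * 4-wide SSE store used by the aligned spreading kernels. */
static int round_up_to_multiple(int n, int width)
{
    return ((n + width - 1)/width)*width;
}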
@@ -1867,15 +1876,21 @@ static void realloc_work(pme_work_t *work, int nkx)
         srenew(work->mhy, work->nalloc);
         srenew(work->mhz, work->nalloc);
         srenew(work->m2, work->nalloc);
-        /* Allocate an aligned pointer for SSE operations, including 3 extra
-         * elements at the end since SSE operates on 4 elements at a time.
+        /* Allocate an aligned pointer for SIMD operations, including extra
+         * elements at the end for padding.
          */
+#ifdef PME_SIMD
+#define ALIGN_HERE  GMX_SIMD_WIDTH_HERE
+#else
+/* We can use any alignment, apart from 0, so we use 4 */
+#define ALIGN_HERE  4
+#endif
         sfree_aligned(work->denom);
         sfree_aligned(work->tmp1);
         sfree_aligned(work->eterm);
-        snew_aligned(work->denom, work->nalloc+3, 16);
-        snew_aligned(work->tmp1, work->nalloc+3, 16);
-        snew_aligned(work->eterm, work->nalloc+3, 16);
+        snew_aligned(work->denom, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+        snew_aligned(work->tmp1,  work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+        snew_aligned(work->eterm, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
         srenew(work->m2inv, work->nalloc);
     }
 }
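
The ALIGN_HERE extra elements and the ALIGN_HERE*sizeof(real) byte alignment are what allow the solve loop below to walk these arrays a full SIMD width at a time without a scalar remainder loop. In plain C the same allocation pattern looks roughly like this (aligned_alloc and the helper name are illustration only, not the snew_aligned machinery):

#include <stdlib.h>

typedef float real;          /* stand-in for GROMACS' real       */
#define SIMD_WIDTH 4         /* stand-in for GMX_SIMD_WIDTH_HERE */

/* Allocate n reals plus SIMD_WIDTH elements of tail padding, aligned to
 * SIMD_WIDTH*sizeof(real) bytes, mirroring the snew_aligned() calls above. */
static real *alloc_simd_padded(int n)
{
    size_t align = SIMD_WIDTH*sizeof(real);
    size_t bytes = ((size_t)n + SIMD_WIDTH)*sizeof(real);

    /* C11 aligned_alloc expects the size to be a multiple of the alignment */
    bytes = (bytes + align - 1)/align*align;

    return (real *)aligned_alloc(align, bytes);
}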
@@ -1894,27 +1909,26 @@ static void free_work(pme_work_t *work)
 }
 
 
-#ifdef PME_SSE
-/* Calculate exponentials through SSE in float precision */
+#ifdef PME_SIMD
+/* Calculate exponentials through SIMD */
 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
 {
     {
-        const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
-        __m128 f_sse;
-        __m128 lu;
-        __m128 tmp_d1, d_inv, tmp_r, tmp_e;
+        const gmx_mm_pr two = gmx_set1_pr(2.0);
+        gmx_mm_pr f_simd;
+        gmx_mm_pr lu;
+        gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
         int kx;
-        f_sse = _mm_load1_ps(&f);
-        for (kx = 0; kx < end; kx += 4)
+        f_simd = gmx_load1_pr(&f);
+        for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
         {
-            tmp_d1   = _mm_load_ps(d_aligned+kx);
-            lu       = _mm_rcp_ps(tmp_d1);
-            d_inv    = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
-            tmp_r    = _mm_load_ps(r_aligned+kx);
-            tmp_r    = gmx_mm_exp_ps(tmp_r);
-            tmp_e    = _mm_mul_ps(f_sse, d_inv);
-            tmp_e    = _mm_mul_ps(tmp_e, tmp_r);
-            _mm_store_ps(e_aligned+kx, tmp_e);
+            tmp_d1   = gmx_load_pr(d_aligned+kx);
+            d_inv    = gmx_inv_pr(tmp_d1);
+            tmp_r    = gmx_load_pr(r_aligned+kx);
+            tmp_r    = gmx_exp_pr(tmp_r);
+            tmp_e    = gmx_mul_pr(f_simd, d_inv);
+            tmp_e    = gmx_mul_pr(tmp_e, tmp_r);
+            gmx_store_pr(e_aligned+kx, tmp_e);
         }
     }
 }
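
Element by element, the loop above computes e[kx] = f/d[kx] * exp(r[kx]) over the padded, aligned arrays; the explicit _mm_rcp_ps-plus-Newton-step reciprocal of the removed SSE code is now hidden behind gmx_inv_pr. A scalar reference of the same kernel (names shortened, not part of the patch):

#include <math.h>

typedef float real;   /* stand-in for GROMACS' real */

/* Scalar reference for the SIMD loop above; d, r and e are the padded,
 * aligned denom/tmp1/eterm arrays from realloc_work(). */
static void calc_exponentials_ref(int end, real f,
                                  const real *d, const real *r, real *e)
{
    int kx;

    for (kx = 0; kx < end; kx++)
    {
        e[kx] = f/d[kx]*exp(r[kx]);
    }
}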
@@ -2311,7 +2325,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
             switch (order)
             {
                 case 4:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
 #ifdef PME_SSE_UNALIGNED
 #define PME_GATHER_F_SSE_ORDER4
 #else
@@ -2324,7 +2338,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
 #endif
                     break;
                 case 5:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
 #define PME_GATHER_F_SSE_ALIGNED
 #define PME_ORDER 5
 #include "pme_sse_single.h"
@@ -3001,7 +3015,7 @@ static pme_spline_work_t *make_pme_spline_work(int order)
 {
     pme_spline_work_t *work;
 
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
     float  tmp[8];
     __m128 zero_SSE;
     int    of, i;
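
The rest of this hunk is cut off. For context, masks of the shape declared in the struct above can be built by comparing a 0/1 lane pattern against zero, along these lines (a sketch, not the code that follows in pme.c):

#include <emmintrin.h>

/* Sketch: build the pair of 4-wide masks for an interpolation of the given
 * order starting at offset "of" within an 8-element window, with all-ones
 * lanes inside the support and all-zeros lanes outside it. */
static void make_order_masks(int order, int of, __m128 *mask0, __m128 *mask1)
{
    float  tmp[8];
    __m128 zero_SSE = _mm_setzero_ps();
    int    i;

    for (i = 0; i < 8; i++)
    {
        tmp[i] = (i >= of && i < of + order) ? 1.0f : 0.0f;
    }
    *mask0 = _mm_cmpgt_ps(_mm_loadu_ps(tmp),     zero_SSE);
    *mask1 = _mm_cmpgt_ps(_mm_loadu_ps(tmp + 4), zero_SSE);
}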