implemented plain-C SIMD macros for reference
diff --git a/src/mdlib/pme.c b/src/mdlib/pme.c
index 520aa7368aa98d3c4cbf5455783e1afdb97b61ac..df8b463d5d9e124507ecd0ef075f13d7f3afc3fd 100644
--- a/src/mdlib/pme.c
+++ b/src/mdlib/pme.c
 #include "gmx_cyclecounter.h"
 #include "gmx_omp.h"
 
-/* Single precision, with SSE2 or higher available */
-#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
 
-#include "gmx_x86_simd_single.h"
+/* Include the SIMD macro file and then check for support */
+#include "gmx_simd_macros.h"
+#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP
+/* Turn on SIMD intrinsics for PME solve */
+#define PME_SIMD
+#endif
 
-#define PME_SSE
+/* SIMD spread+gather only in single precision with SSE2 or higher available.
+ * We might want to switch to using gmx_simd_macros.h, but this is somewhat
+ * complicated, as we use unaligned and/or 4-wide-only loads.
+ */
+#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
+#define PME_SSE_SPREAD_GATHER
+#include <emmintrin.h>
 /* Some old AMD processors could have problems with unaligned loads+stores */
 #ifndef GMX_FAHCORE
 #define PME_SSE_UNALIGNED
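
The plain-C reference SIMD macros named in the commit message live in gmx_simd_macros.h, which is not part of this diff. As a sketch only, with the vector width, the vector type and the real typedef assumed here rather than taken from that file, the macros used in the solve path below (gmx_mm_pr, gmx_load_pr, gmx_mul_pr, gmx_exp_pr, gmx_store_pr) could be given plain-C fallbacks along these lines:

/* Illustrative plain-C fallback for a few of the SIMD macros used below.
 * Assumed shapes only; the real definitions live in gmx_simd_macros.h. */
#include <math.h>

typedef float real;                       /* stand-in for GROMACS' real     */
#define GMX_SIMD_WIDTH_HERE 4             /* assumed reference vector width */

typedef struct { real r[GMX_SIMD_WIDTH_HERE]; } gmx_mm_pr;

static gmx_mm_pr gmx_load_pr(const real *a)
{
    gmx_mm_pr x;
    int       i;

    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
    {
        x.r[i] = a[i];
    }
    return x;
}

static gmx_mm_pr gmx_mul_pr(gmx_mm_pr a, gmx_mm_pr b)
{
    gmx_mm_pr x;
    int       i;

    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
    {
        x.r[i] = a.r[i]*b.r[i];
    }
    return x;
}

static gmx_mm_pr gmx_exp_pr(gmx_mm_pr a)
{
    gmx_mm_pr x;
    int       i;

    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
    {
        x.r[i] = exp(a.r[i]);
    }
    return x;
}

static void gmx_store_pr(real *a, gmx_mm_pr x)
{
    int i;

    for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
    {
        a[i] = x.r[i];
    }
}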
@@ -231,7 +240,7 @@ typedef struct {
 
 
 typedef struct {
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
     /* Masks for SSE aligned spreading and gathering */
     __m128 mask_SSE0[6], mask_SSE1[6];
 #else
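
When PME_SSE_SPREAD_GATHER is set, the spline-work struct carries two banks of six 4-wide masks. A reasonable reading (pme_sse_single.h is not part of this diff) is that they zero the lanes outside the spline support, so spreading and gathering can always issue full-width loads and stores; a masked accumulate of that kind might look like:

#include <emmintrin.h>

/* Sketch only: add a 4-wide contribution to an aligned grid line, with the
 * lanes outside the spline support zeroed by a precomputed all-ones/all-zeros
 * mask such as mask_SSE0/mask_SSE1 above. */
static void masked_accumulate(float *grid_line, __m128 contrib, __m128 mask)
{
    __m128 g;

    g = _mm_load_ps(grid_line);                   /* aligned 4-wide load       */
    g = _mm_add_ps(g, _mm_and_ps(contrib, mask)); /* keep only the valid lanes */
    _mm_store_ps(grid_line, g);
}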
@@ -1544,7 +1553,7 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
             switch (order)
             {
                 case 4:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
 #ifdef PME_SSE_UNALIGNED
 #define PME_SPREAD_SSE_ORDER4
 #else
@@ -1557,7 +1566,7 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 #endif
                     break;
                 case 5:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
 #define PME_SPREAD_SSE_ALIGNED
 #define PME_ORDER 5
 #include "pme_sse_single.h"
@@ -1575,7 +1584,7 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 
 static void set_grid_alignment(int *pmegrid_nz, int pme_order)
 {
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
     if (pme_order == 5
 #ifndef PME_SSE_UNALIGNED
         || pme_order == 4
@@ -1590,7 +1599,7 @@ static void set_grid_alignment(int *pmegrid_nz, int pme_order)
 
 static void set_gridsize_alignment(int *gridsize, int pme_order)
 {
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
 #ifndef PME_SSE_UNALIGNED
     if (pme_order == 4)
     {
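
These two helpers pad the grid's z dimension and the per-thread grid size so the aligned 4-wide spreading kernels can step over them safely; the rounding itself lies outside the shown hunks, but the arithmetic is simply rounding up to a multiple of the vector width:

/* Round n up to a multiple of width (width > 0), e.g. to a multiple of the
 * 4-wide SSE store used by the aligned spreading kernels. */
static int round_up_to_multiple(int n, int width)
{
    return ((n + width - 1)/width)*width;
}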
@@ -1867,15 +1876,21 @@ static void realloc_work(pme_work_t *work, int nkx)
         srenew(work->mhy, work->nalloc);
         srenew(work->mhz, work->nalloc);
         srenew(work->m2, work->nalloc);
-        /* Allocate an aligned pointer for SSE operations, including 3 extra
-         * elements at the end since SSE operates on 4 elements at a time.
+        /* Allocate an aligned pointer for SIMD operations, including extra
+         * elements at the end for padding.
          */
+#ifdef PME_SIMD
+#define ALIGN_HERE  GMX_SIMD_WIDTH_HERE
+#else
+/* We can use any alignment, apart from 0, so we use 4 */
+#define ALIGN_HERE  4
+#endif
         sfree_aligned(work->denom);
         sfree_aligned(work->tmp1);
         sfree_aligned(work->eterm);
-        snew_aligned(work->denom, work->nalloc+3, 16);
-        snew_aligned(work->tmp1, work->nalloc+3, 16);
-        snew_aligned(work->eterm, work->nalloc+3, 16);
+        snew_aligned(work->denom, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+        snew_aligned(work->tmp1,  work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+        snew_aligned(work->eterm, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
         srenew(work->m2inv, work->nalloc);
     }
 }
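
The ALIGN_HERE extra elements and the ALIGN_HERE*sizeof(real) byte alignment are what allow the solve loop below to walk these arrays a full SIMD width at a time without a scalar remainder loop. In plain C the same allocation pattern looks roughly like this (aligned_alloc and the helper name are illustration only, not the snew_aligned machinery):

#include <stdlib.h>

typedef float real;          /* stand-in for GROMACS' real       */
#define SIMD_WIDTH 4         /* stand-in for GMX_SIMD_WIDTH_HERE */

/* Allocate n reals plus SIMD_WIDTH elements of tail padding, aligned to
 * SIMD_WIDTH*sizeof(real) bytes, mirroring the snew_aligned() calls above. */
static real *alloc_simd_padded(int n)
{
    size_t align = SIMD_WIDTH*sizeof(real);
    size_t bytes = ((size_t)n + SIMD_WIDTH)*sizeof(real);

    /* C11 aligned_alloc expects the size to be a multiple of the alignment */
    bytes = (bytes + align - 1)/align*align;

    return (real *)aligned_alloc(align, bytes);
}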
@@ -1894,27 +1909,26 @@ static void free_work(pme_work_t *work)
 }
 
 
-#ifdef PME_SSE
-/* Calculate exponentials through SSE in float precision */
+#ifdef PME_SIMD
+/* Calculate exponentials through SIMD */
 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
 {
     {
-        const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
-        __m128 f_sse;
-        __m128 lu;
-        __m128 tmp_d1, d_inv, tmp_r, tmp_e;
+        const gmx_mm_pr two = gmx_set1_pr(2.0);
+        gmx_mm_pr f_simd;
+        gmx_mm_pr lu;
+        gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
         int kx;
-        f_sse = _mm_load1_ps(&f);
-        for (kx = 0; kx < end; kx += 4)
+        f_simd = gmx_load1_pr(&f);
+        for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
         {
-            tmp_d1   = _mm_load_ps(d_aligned+kx);
-            lu       = _mm_rcp_ps(tmp_d1);
-            d_inv    = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
-            tmp_r    = _mm_load_ps(r_aligned+kx);
-            tmp_r    = gmx_mm_exp_ps(tmp_r);
-            tmp_e    = _mm_mul_ps(f_sse, d_inv);
-            tmp_e    = _mm_mul_ps(tmp_e, tmp_r);
-            _mm_store_ps(e_aligned+kx, tmp_e);
+            tmp_d1   = gmx_load_pr(d_aligned+kx);
+            d_inv    = gmx_inv_pr(tmp_d1);
+            tmp_r    = gmx_load_pr(r_aligned+kx);
+            tmp_r    = gmx_exp_pr(tmp_r);
+            tmp_e    = gmx_mul_pr(f_simd, d_inv);
+            tmp_e    = gmx_mul_pr(tmp_e, tmp_r);
+            gmx_store_pr(e_aligned+kx, tmp_e);
         }
     }
 }
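
Element by element, the loop above computes e[kx] = f/d[kx] * exp(r[kx]) over the padded, aligned arrays; the explicit _mm_rcp_ps-plus-Newton-step reciprocal of the removed SSE code is now hidden behind gmx_inv_pr. A scalar reference of the same kernel (names shortened, not part of the patch):

#include <math.h>

typedef float real;   /* stand-in for GROMACS' real */

/* Scalar reference for the SIMD loop above; d, r and e are the padded,
 * aligned denom/tmp1/eterm arrays from realloc_work(). */
static void calc_exponentials_ref(int end, real f,
                                  const real *d, const real *r, real *e)
{
    int kx;

    for (kx = 0; kx < end; kx++)
    {
        e[kx] = f/d[kx]*exp(r[kx]);
    }
}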
@@ -2311,7 +2325,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
             switch (order)
             {
                 case 4:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
 #ifdef PME_SSE_UNALIGNED
 #define PME_GATHER_F_SSE_ORDER4
 #else
@@ -2324,7 +2338,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
 #endif
                     break;
                 case 5:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
 #define PME_GATHER_F_SSE_ALIGNED
 #define PME_ORDER 5
 #include "pme_sse_single.h"
@@ -3001,7 +3015,7 @@ static pme_spline_work_t *make_pme_spline_work(int order)
 {
     pme_spline_work_t *work;
 
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
     float  tmp[8];
     __m128 zero_SSE;
     int    of, i;
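
The rest of this hunk is cut off. For context, masks of the shape declared in the struct above can be built by comparing a 0/1 lane pattern against zero, along these lines (a sketch, not the code that follows in pme.c):

#include <emmintrin.h>

/* Sketch: build the pair of 4-wide masks for an interpolation of the given
 * order starting at offset "of" within an 8-element window, with all-ones
 * lanes inside the support and all-zeros lanes outside it. */
static void make_order_masks(int order, int of, __m128 *mask0, __m128 *mask1)
{
    float  tmp[8];
    __m128 zero_SSE = _mm_setzero_ps();
    int    i;

    for (i = 0; i < 8; i++)
    {
        tmp[i] = (i >= of && i < of + order) ? 1.0f : 0.0f;
    }
    *mask0 = _mm_cmpgt_ps(_mm_loadu_ps(tmp),     zero_SSE);
    *mask1 = _mm_cmpgt_ps(_mm_loadu_ps(tmp + 4), zero_SSE);
}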