#include "gmx_cyclecounter.h"
#include "gmx_omp.h"
-/* Single precision, with SSE2 or higher available */
-#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
-#include "gmx_x86_simd_single.h"
+/* Include the SIMD macro file and then check for support */
+#include "gmx_simd_macros.h"
+#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP
+/* Turn on SIMD intrinsics for PME solve */
+#define PME_SIMD
+#endif
-#define PME_SSE
+/* SIMD spread+gather only in single precision with SSE2 or higher available.
+ * We might want to switch to using gmx_simd_macros.h, but this is somewhat
+ * complicated, as we use unaligned and/or 4-wide-only loads.
+ */
+#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
+#define PME_SSE_SPREAD_GATHER
+#include <emmintrin.h>
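+/* emmintrin.h provides the SSE2 intrinsics (__m128 etc.) used by the
+ * spread/gather kernels below */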
/* Some old AMD processors could have problems with unaligned loads+stores */
#ifndef GMX_FAHCORE
#define PME_SSE_UNALIGNED
typedef struct {
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
/* Masks for SSE aligned spreading and gathering */
__m128 mask_SSE0[6], mask_SSE1[6];
#else
switch (order)
{
case 4:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
#ifdef PME_SSE_UNALIGNED
#define PME_SPREAD_SSE_ORDER4
#else
#endif
break;
case 5:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
#define PME_SPREAD_SSE_ALIGNED
#define PME_ORDER 5
#include "pme_sse_single.h"
static void set_grid_alignment(int *pmegrid_nz, int pme_order)
{
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
if (pme_order == 5
#ifndef PME_SSE_UNALIGNED
|| pme_order == 4
static void set_gridsize_alignment(int *gridsize, int pme_order)
{
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
#ifndef PME_SSE_UNALIGNED
if (pme_order == 4)
{
srenew(work->mhy, work->nalloc);
srenew(work->mhz, work->nalloc);
srenew(work->m2, work->nalloc);
- /* Allocate an aligned pointer for SSE operations, including 3 extra
- * elements at the end since SSE operates on 4 elements at a time.
+ /* Allocate an aligned pointer for SIMD operations, including extra
+ * elements at the end for padding.
*/
+#ifdef PME_SIMD
+#define ALIGN_HERE GMX_SIMD_WIDTH_HERE
+#else
+/* We can use any alignment, apart from 0, so we use 4 */
+#define ALIGN_HERE 4
+#endif
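+/* The ALIGN_HERE extra elements ensure that the SIMD loop in
+ * calc_exponentials() can safely load and store one full SIMD width
+ * past work->nalloc. */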
sfree_aligned(work->denom);
sfree_aligned(work->tmp1);
sfree_aligned(work->eterm);
- snew_aligned(work->denom, work->nalloc+3, 16);
- snew_aligned(work->tmp1, work->nalloc+3, 16);
- snew_aligned(work->eterm, work->nalloc+3, 16);
+ snew_aligned(work->denom, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+ snew_aligned(work->tmp1, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+ snew_aligned(work->eterm, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
srenew(work->m2inv, work->nalloc);
}
}
}
-#ifdef PME_SSE
-/* Calculate exponentials through SSE in float precision */
+#ifdef PME_SIMD
+/* Calculate exponentials through SIMD */
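+/* Computes e[kx] = f*exp(r[kx])/d[kx] for kx in [0,end); the last iteration
+ * can run up to GMX_SIMD_WIDTH_HERE-1 elements past end, which is safe
+ * because of the padded, aligned allocation above. */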
inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
{
{
- const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
- __m128 f_sse;
- __m128 lu;
- __m128 tmp_d1, d_inv, tmp_r, tmp_e;
+ gmx_mm_pr f_simd;
+ gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
int kx;
- f_sse = _mm_load1_ps(&f);
- for (kx = 0; kx < end; kx += 4)
+ f_simd = gmx_load1_pr(&f);
+ for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
{
- tmp_d1 = _mm_load_ps(d_aligned+kx);
- lu = _mm_rcp_ps(tmp_d1);
- d_inv = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
- tmp_r = _mm_load_ps(r_aligned+kx);
- tmp_r = gmx_mm_exp_ps(tmp_r);
- tmp_e = _mm_mul_ps(f_sse, d_inv);
- tmp_e = _mm_mul_ps(tmp_e, tmp_r);
- _mm_store_ps(e_aligned+kx, tmp_e);
+ tmp_d1 = gmx_load_pr(d_aligned+kx);
+ d_inv = gmx_inv_pr(tmp_d1);
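+ /* gmx_inv_pr handles the reciprocal, including the Newton-Raphson
+ * refinement previously coded explicitly with _mm_rcp_ps above */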
+ tmp_r = gmx_load_pr(r_aligned+kx);
+ tmp_r = gmx_exp_pr(tmp_r);
+ tmp_e = gmx_mul_pr(f_simd, d_inv);
+ tmp_e = gmx_mul_pr(tmp_e, tmp_r);
+ gmx_store_pr(e_aligned+kx, tmp_e);
}
}
}
switch (order)
{
case 4:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
#ifdef PME_SSE_UNALIGNED
#define PME_GATHER_F_SSE_ORDER4
#else
#endif
break;
case 5:
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
#define PME_GATHER_F_SSE_ALIGNED
#define PME_ORDER 5
#include "pme_sse_single.h"
{
pme_spline_work_t *work;
-#ifdef PME_SSE
+#ifdef PME_SSE_SPREAD_GATHER
float tmp[8];
__m128 zero_SSE;
int of, i;
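+    /* Set up the lane masks used by the aligned spread/gather kernels,
+     * one per possible grid-index offset (see mask_SSE0/mask_SSE1) */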