This is in preparation for non-x86 SIMD acceleration.
Change-Id: Idc652236f4b2e0f48e759d579c798c1b0dc6944f
* the research papers on the package. Check out http://www.gromacs.org.
*/
+/* The macros in this file are intended to be used for writing
+ * architecture independent SIMD intrinsics code.
+ * To support a new architecture, adding macros here should be (nearly)
+ * all that is needed.
+ */
+
/* Undefine all defines used below so we can include this file multiple times
* with different settings from the same source file.
*/
/* NOTE: floor and blend are NOT available with SSE2 only acceleration */
-#undef GMX_X86_SIMD_WIDTH_HERE
+#undef GMX_SIMD_WIDTH_HERE
#undef gmx_epi32
#include "gmx_x86_simd_single.h"
-#define GMX_X86_SIMD_WIDTH_HERE 4
+#define GMX_SIMD_WIDTH_HERE 4
#define gmx_mm_pr __m128
#include "gmx_x86_simd_double.h"
-#define GMX_X86_SIMD_WIDTH_HERE 2
+#define GMX_SIMD_WIDTH_HERE 2
#define gmx_mm_pr __m128d
#include "gmx_x86_simd_single.h"
-#define GMX_X86_SIMD_WIDTH_HERE 8
+#define GMX_SIMD_WIDTH_HERE 8
#define gmx_mm_pr __m256
#include "gmx_x86_simd_double.h"
-#define GMX_X86_SIMD_WIDTH_HERE 4
+#define GMX_SIMD_WIDTH_HERE 4
#define gmx_mm_pr __m256d
#else
#define GMX_MM128_HERE
#endif
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
int i,s;
gmx_mm_pr dest_SSE,src_SSE;
if (bDestSet)
{
- for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+ for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
{
dest_SSE = gmx_load_pr(dest+i);
for(s=0; s<nsrc; s++)
}
else
{
- for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
+ for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
{
dest_SSE = gmx_load_pr(src[0]+i);
for(s=1; s<nsrc; s++)
#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
#endif
#endif
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
typedef struct nbnxn_x_ci_simd_4xn {
/* The i-cluster coordinates for simple search */
const real *VSvdw,const real *VSc,
real *Vvdw,real *Vc)
{
- const int simd_width = GMX_X86_SIMD_WIDTH_HERE;
- const int unrollj_half = GMX_X86_SIMD_WIDTH_HERE/4;
+ const int simd_width = GMX_SIMD_WIDTH_HERE;
+ const int unrollj_half = GMX_SIMD_WIDTH_HERE/4;
int ng_p2,i,j,j0,j1,c,s;
ng_p2 = (1<<ng_2log);
*/
/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
-#define UNROLLJ (GMX_X86_SIMD_WIDTH_HERE/2)
+#define UNROLLJ (GMX_SIMD_WIDTH_HERE/2)
#if defined GMX_MM128_HERE || defined GMX_DOUBLE
#define STRIDE 4
const real *VSvdw,const real *VSc,
real *Vvdw,real *Vc)
{
- const int simd_width = GMX_X86_SIMD_WIDTH_HERE;
- const int unrollj_half = GMX_X86_SIMD_WIDTH_HERE/2;
+ const int simd_width = GMX_SIMD_WIDTH_HERE;
+ const int unrollj_half = GMX_SIMD_WIDTH_HERE/2;
int ng_p2,i,j,j0,j1,c,s;
ng_p2 = (1<<ng_2log);
*/
/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
-#define UNROLLJ GMX_X86_SIMD_WIDTH_HERE
+#define UNROLLJ GMX_SIMD_WIDTH_HERE
#if defined GMX_MM128_HERE || defined GMX_DOUBLE
#define STRIDE 4
{
gmx_mm_pr v_SSE;
- v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*GMX_X86_SIMD_WIDTH_HERE);
- gmx_store_pr(v+offset_jj[jj]+jj*GMX_X86_SIMD_WIDTH_HERE,gmx_add_pr(v_SSE,e_SSE));
+ v_SSE = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
+ gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE,gmx_add_pr(v_SSE,e_SSE));
}
}
-#if defined GMX_X86_AVX_256 && GMX_X86_SIMD_WIDTH_HERE == 8
+#if defined GMX_X86_AVX_256 && GMX_SIMD_WIDTH_HERE == 8
/* As add_ener_grp above, but for two groups of UNROLLJ/2 stored in
* a single SIMD register.
*/
{
gmx_mm_hpr v_SSE;
- v_SSE = gmx_load_hpr(v0+offset_jj[jj]+jj*GMX_X86_SIMD_WIDTH_HERE/2);
- gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_X86_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE0));
+ v_SSE = gmx_load_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE0));
}
for(jj=0; jj<(UNROLLJ/2); jj++)
{
gmx_mm_hpr v_SSE;
- v_SSE = gmx_load_hpr(v1+offset_jj[jj]+jj*GMX_X86_SIMD_WIDTH_HERE/2);
- gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_X86_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE1));
+ v_SSE = gmx_load_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
+ gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2,gmx_add_hpr(v_SSE,e_SSE1));
}
}
#endif
#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
#endif
#endif
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
-#if GMX_X86_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
-#define STRIDE_S (GMX_X86_SIMD_WIDTH_HERE/2)
+#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S (GMX_SIMD_WIDTH_HERE/2)
#else
#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE
#endif
InRange = gmx_movemask_pr(wco_any_SSE);
- *ndistc += 2*GMX_X86_SIMD_WIDTH_HERE;
+ *ndistc += 2*GMX_SIMD_WIDTH_HERE;
}
if (!InRange)
{
InRange = gmx_movemask_pr(wco_any_SSE);
- *ndistc += 2*GMX_X86_SIMD_WIDTH_HERE;
+ *ndistc += 2*GMX_SIMD_WIDTH_HERE;
}
if (!InRange)
{
#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
#endif
#endif
-#include "gmx_x86_simd_macros.h"
+#include "gmx_simd_macros.h"
-#if GMX_X86_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
-#define STRIDE_S (GMX_X86_SIMD_WIDTH_HERE)
+#if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
+#define STRIDE_S (GMX_SIMD_WIDTH_HERE)
#else
#define STRIDE_S NBNXN_CPU_CLUSTER_I_SIZE
#endif
InRange = gmx_movemask_pr(wco_any_SSE);
- *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE;
+ *ndistc += 4*GMX_SIMD_WIDTH_HERE;
}
if (!InRange)
{
InRange = gmx_movemask_pr(wco_any_SSE);
- *ndistc += 4*GMX_X86_SIMD_WIDTH_HERE;
+ *ndistc += 4*GMX_SIMD_WIDTH_HERE;
}
if (!InRange)
{