* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
+#if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+
+#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
+#define GMX_USE_HALF_WIDTH_SIMD_HERE
+#endif
#include "gmx_simd_macros.h"
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ GMX_SIMD_WIDTH_HERE
-#if defined GMX_MM128_HERE || defined GMX_DOUBLE
-#define STRIDE 4
-#endif
-#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
-#define STRIDE 8
+/* The stride of all the atom data arrays is max(UNROLLI,UNROLLJ) */
+#if GMX_SIMD_WIDTH_HERE >= UNROLLI
+#define STRIDE GMX_SIMD_WIDTH_HERE
+#else
+#define STRIDE UNROLLI
#endif
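
A minimal standalone sketch (not part of the patch): the new STRIDE rule, max(UNROLLI, UNROLLJ), can be checked against the hard-coded strides it replaces, assuming UNROLLI == 4 as for the x86 i-cluster size:

    #include <stdio.h>

    /* Sketch: evaluate STRIDE = max(UNROLLI, UNROLLJ) for the four
     * width/precision combinations this file supports (UNROLLI assumed 4).
     * The result reproduces the removed hard-coded strides: 4, 4, 8, 4. */
    int main(void)
    {
        const int   unrolli  = 4;
        const int   widths[] = { 4, 2, 8, 4 };
        const char *cfg[]    = { "128-bit single", "128-bit double",
                                 "256-bit single", "256-bit double" };
        int         i;

        for (i = 0; i < 4; i++)
        {
            int unrollj = widths[i];
            int stride  = (unrollj >= unrolli) ? unrollj : unrolli;

            printf("%-14s  UNROLLJ=%d  STRIDE=%d\n", cfg[i], unrollj, stride);
        }
        return 0;
    }
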
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
-/* single precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
-#define TAB_FDV0
+#if GMX_SIMD_WIDTH_HERE == 2
+#define SUM_SIMD(x) (x[0]+x[1])
+#else
+#if GMX_SIMD_WIDTH_HERE == 4
+#define SUM_SIMD(x) SUM_SIMD4(x)
#else
-/* double precision 4x2 kernel */
-#define SUM_SIMD(x) (x[0]+x[1])
+#if GMX_SIMD_WIDTH_HERE == 8
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+#else
+#error "unsupported kernel configuration"
+#endif
#endif
#endif
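
The SUM_SIMD macros are plain horizontal reductions over an aligned scratch array holding the lanes of one SIMD register; a minimal usage sketch for the 4-wide case (the 2- and 8-wide variants differ only in lane count):

    #include <stdio.h>

    /* Sketch of the SUM_SIMD reduction pattern: a SIMD accumulator is
     * stored to aligned scratch and its lanes are summed in scalar code. */
    #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
    #define SUM_SIMD(x)  SUM_SIMD4(x)

    int main(void)
    {
        float tmpsum[4] = { 0.5f, -1.0f, 2.0f, 0.25f }; /* per-lane partials */

        printf("reduced energy: %g\n", (double)SUM_SIMD(tmpsum));
        return 0;
    }
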
-#ifdef GMX_MM256_HERE
-#ifndef GMX_DOUBLE
-/* single precision 4x8 kernel */
-#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+
+/* Decide if we should use the FDV0 table layout */
+#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
+#if GMX_SIMD_WIDTH_HERE/2 == 4
#define TAB_FDV0
+#endif
#else
-/* double precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
+/* We use the FDV0 table layout when we can use aligned table loads */
+#if GMX_SIMD_WIDTH_HERE == 4
+#define TAB_FDV0
#endif
#endif
+
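
Why the FDV0 layout earns its aligned loads, as far as the surrounding code shows (a single tab_coul_F versus separate tab_coul_F/tab_coul_V tables): per table point, force, force difference, energy, and a zero pad are packed into one 4-real quadruplet, so a single aligned 4-wide load fetches everything for that point. A sketch; the packing order is an assumption based on the name F-D-V-0:

    #include <stdio.h>
    #include <stdlib.h>

    /* Sketch (field order assumed): build one packed FDV0 table from
     * separate force and energy tables, so each point needs a single
     * aligned 4-real load instead of loads from two tables. */
    static float *make_fdv0(const float *F, const float *V, int n)
    {
        float *t = malloc(4*(size_t)n*sizeof *t);
        int    i;

        for (i = 0; i < n; i++)
        {
            t[4*i+0] = F[i];                             /* F: force         */
            t[4*i+1] = (i+1 < n ? F[i+1] : F[i]) - F[i]; /* D: F[i+1] - F[i] */
            t[4*i+2] = V[i];                             /* V: energy        */
            t[4*i+3] = 0.0f;                             /* 0: padding       */
        }
        return t;
    }

    int main(void)
    {
        const float F[3] = { 1.0f, 0.5f, 0.25f }, V[3] = { 2.0f, 1.0f, 0.5f };
        float      *t    = make_fdv0(F, V, 3);

        printf("point 0: F=%g D=%g V=%g pad=%g\n",
               (double)t[0], (double)t[1], (double)t[2], (double)t[3]);
        free(t);
        return 0;
    }
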
#define SIMD_MASK_ALL 0xffffffff
#include "nbnxn_kernel_simd_utils.h"
real *vctp[UNROLLI];
#endif
- gmx_mm_pr shX_SSE;
- gmx_mm_pr shY_SSE;
- gmx_mm_pr shZ_SSE;
- gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
- gmx_mm_pr ix_SSE1, iy_SSE1, iz_SSE1;
- gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
- gmx_mm_pr ix_SSE3, iy_SSE3, iz_SSE3;
- gmx_mm_pr fix_SSE0, fiy_SSE0, fiz_SSE0;
- gmx_mm_pr fix_SSE1, fiy_SSE1, fiz_SSE1;
- gmx_mm_pr fix_SSE2, fiy_SSE2, fiz_SSE2;
- gmx_mm_pr fix_SSE3, fiy_SSE3, fiz_SSE3;
+ gmx_mm_pr shX_S;
+ gmx_mm_pr shY_S;
+ gmx_mm_pr shZ_S;
+ gmx_mm_pr ix_S0, iy_S0, iz_S0;
+ gmx_mm_pr ix_S1, iy_S1, iz_S1;
+ gmx_mm_pr ix_S2, iy_S2, iz_S2;
+ gmx_mm_pr ix_S3, iy_S3, iz_S3;
+ gmx_mm_pr fix_S0, fiy_S0, fiz_S0;
+ gmx_mm_pr fix_S1, fiy_S1, fiz_S1;
+ gmx_mm_pr fix_S2, fiy_S2, fiz_S2;
+ gmx_mm_pr fix_S3, fiy_S3, fiz_S3;
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
- __m128 fix_SSE, fiy_SSE, fiz_SSE;
+ __m128 fix_S, fiy_S, fiz_S;
#else
- __m256d fix_SSE, fiy_SSE, fiz_SSE;
+ __m256d fix_S, fiy_S, fiz_S;
#endif
#else
- __m128d fix0_SSE, fiy0_SSE, fiz0_SSE;
- __m128d fix2_SSE, fiy2_SSE, fiz2_SSE;
+ __m128d fix0_S, fiy0_S, fiz0_S;
+ __m128d fix2_S, fiy2_S, fiz2_S;
#endif
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
- __m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
- __m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
- __m128i mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
- __m128i mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );
-#else
- /* For double precision we need to set two 32bit ints for one double */
- __m128i mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
- __m128i mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
- __m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
- __m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
-#endif
-#endif
-#ifdef GMX_MM256_HERE
- /* AVX: use floating point masks, as there are no integer instructions */
-#ifndef GMX_DOUBLE
- gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
- gmx_mm_pr mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
+ gmx_mm_pr diag_jmi_S;
+#if UNROLLI == UNROLLJ
+ gmx_mm_pr diag_S0, diag_S1, diag_S2, diag_S3;
#else
- /* There is no 256-bit int to double conversion, so we use float here */
- __m256 mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
- __m256 mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
- __m256 mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
- __m256 mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
-#endif
+ gmx_mm_pr diag0_S0, diag0_S1, diag0_S2, diag0_S3;
+ gmx_mm_pr diag1_S0, diag1_S1, diag1_S2, diag1_S3;
#endif
- gmx_mm_pr diag_jmi_SSE;
-#if UNROLLI == UNROLLJ
- gmx_mm_pr diag_SSE0, diag_SSE1, diag_SSE2, diag_SSE3;
+#ifdef gmx_checkbitmask_epi32
+ gmx_epi32 mask_S0, mask_S1, mask_S2, mask_S3;
#else
- gmx_mm_pr diag0_SSE0, diag0_SSE1, diag0_SSE2, diag0_SSE3;
- gmx_mm_pr diag1_SSE0, diag1_SSE1, diag1_SSE2, diag1_SSE3;
+ gmx_mm_pr mask_S0, mask_S1, mask_S2, mask_S3;
#endif
-#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
- __m128i zeroi_SSE = _mm_setzero_si128();
-#endif
- gmx_mm_pr zero_SSE = gmx_set1_pr(0);
+ gmx_mm_pr zero_S = gmx_set1_pr(0);
- gmx_mm_pr one_SSE = gmx_set1_pr(1.0);
- gmx_mm_pr iq_SSE0 = gmx_setzero_pr();
- gmx_mm_pr iq_SSE1 = gmx_setzero_pr();
- gmx_mm_pr iq_SSE2 = gmx_setzero_pr();
- gmx_mm_pr iq_SSE3 = gmx_setzero_pr();
- gmx_mm_pr mrc_3_SSE;
+ gmx_mm_pr one_S = gmx_set1_pr(1.0);
+ gmx_mm_pr iq_S0 = gmx_setzero_pr();
+ gmx_mm_pr iq_S1 = gmx_setzero_pr();
+ gmx_mm_pr iq_S2 = gmx_setzero_pr();
+ gmx_mm_pr iq_S3 = gmx_setzero_pr();
+ gmx_mm_pr mrc_3_S;
#ifdef CALC_ENERGIES
- gmx_mm_pr hrc_3_SSE, moh_rc_SSE;
+ gmx_mm_pr hrc_3_S, moh_rc_S;
#endif
#ifdef CALC_COUL_TAB
/* Coulomb table variables */
- gmx_mm_pr invtsp_SSE;
+ gmx_mm_pr invtsp_S;
const real *tab_coul_F;
#ifndef TAB_FDV0
const real *tab_coul_V;
#endif
-#ifdef GMX_MM256_HERE
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
int ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
int ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1;
int ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
int ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3;
#endif
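
The ti*_array over-allocation by GMX_SIMD_WIDTH_HERE-1 ints plus gmx_simd_align_int presumably performs the same pointer round-up that the open-coded casts removed below did; a standalone sketch of that idiom, which also underlies the gmx_simd_align_real calls further down:

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the alignment idiom assumed behind gmx_simd_align_int:
     * over-allocate by WIDTH-1 elements, then round the (int-aligned)
     * pointer up to a WIDTH*sizeof(int) boundary. WIDTH must be a power
     * of two; 8 mimics 256-bit single precision. */
    #define WIDTH 8

    static int *align_int(int *p)
    {
        uintptr_t a = (uintptr_t)(p + WIDTH - 1);

        return (int *)(a & ~(uintptr_t)(WIDTH*sizeof(int) - 1));
    }

    int main(void)
    {
        int  ti_array[2*WIDTH-1]; /* worst-case slack, as in ti0_array above */
        int *ti = align_int(ti_array);

        printf("%p aligned to %zu bytes: %s\n", (void *)ti, WIDTH*sizeof(int),
               ((uintptr_t)ti % (WIDTH*sizeof(int)) == 0) ? "yes" : "no");
        return 0;
    }
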
#ifdef CALC_ENERGIES
- gmx_mm_pr mhalfsp_SSE;
+ gmx_mm_pr mhalfsp_S;
#endif
#endif
#ifdef CALC_COUL_EWALD
- gmx_mm_pr beta2_SSE, beta_SSE;
+ gmx_mm_pr beta2_S, beta_S;
#endif
#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
- gmx_mm_pr sh_ewald_SSE;
+ gmx_mm_pr sh_ewald_S;
#endif
#ifdef LJ_COMB_LB
const real *ljc;
- gmx_mm_pr hsig_i_SSE0, seps_i_SSE0;
- gmx_mm_pr hsig_i_SSE1, seps_i_SSE1;
- gmx_mm_pr hsig_i_SSE2, seps_i_SSE2;
- gmx_mm_pr hsig_i_SSE3, seps_i_SSE3;
+ gmx_mm_pr hsig_i_S0, seps_i_S0;
+ gmx_mm_pr hsig_i_S1, seps_i_S1;
+ gmx_mm_pr hsig_i_S2, seps_i_S2;
+ gmx_mm_pr hsig_i_S3, seps_i_S3;
#else
#ifdef FIX_LJ_C
real pvdw_array[2*UNROLLI*UNROLLJ+3];
real *pvdw_c6, *pvdw_c12;
- gmx_mm_pr c6_SSE0, c12_SSE0;
- gmx_mm_pr c6_SSE1, c12_SSE1;
- gmx_mm_pr c6_SSE2, c12_SSE2;
- gmx_mm_pr c6_SSE3, c12_SSE3;
+ gmx_mm_pr c6_S0, c12_S0;
+ gmx_mm_pr c6_S1, c12_S1;
+ gmx_mm_pr c6_S2, c12_S2;
+ gmx_mm_pr c6_S3, c12_S3;
#endif
#ifdef LJ_COMB_GEOM
const real *ljc;
- gmx_mm_pr c6s_SSE0, c12s_SSE0;
- gmx_mm_pr c6s_SSE1, c12s_SSE1;
- gmx_mm_pr c6s_SSE2 = gmx_setzero_pr(), c12s_SSE2 = gmx_setzero_pr();
- gmx_mm_pr c6s_SSE3 = gmx_setzero_pr(), c12s_SSE3 = gmx_setzero_pr();
+ gmx_mm_pr c6s_S0, c12s_S0;
+ gmx_mm_pr c6s_S1, c12s_S1;
+ gmx_mm_pr c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
+ gmx_mm_pr c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
#endif
#endif /* LJ_COMB_LB */
- gmx_mm_pr vctotSSE, VvdwtotSSE;
- gmx_mm_pr sixthSSE, twelvethSSE;
+ gmx_mm_pr vctot_S, Vvdwtot_S;
+ gmx_mm_pr sixth_S, twelveth_S;
- gmx_mm_pr avoid_sing_SSE;
- gmx_mm_pr rc2_SSE;
+ gmx_mm_pr avoid_sing_S;
+ gmx_mm_pr rc2_S;
#ifdef VDW_CUTOFF_CHECK
- gmx_mm_pr rcvdw2_SSE;
+ gmx_mm_pr rcvdw2_S;
#endif
#ifdef CALC_ENERGIES
- gmx_mm_pr sh_invrc6_SSE, sh_invrc12_SSE;
+ gmx_mm_pr sh_invrc6_S, sh_invrc12_S;
/* cppcheck-suppress unassignedVariable */
real tmpsum_array[15], *tmpsum;
#endif
/* Load j-i for the first i */
- diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag);
+ diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag);
/* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
- diag_SSE0 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag_SSE1 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag_SSE2 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag_SSE3 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
+ diag_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
#else
#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
- diag0_SSE0 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag0_SSE1 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag0_SSE2 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag0_SSE3 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
+ diag0_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag0_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag0_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag0_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
#if UNROLLI == 2*UNROLLJ
/* Load j-i for the second half of the j-cluster */
- diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
+ diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
#endif
- diag1_SSE0 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag1_SSE1 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag1_SSE2 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag1_SSE3 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
+ diag1_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag1_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag1_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag1_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
#endif
#endif
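
What the diagonal-mask construction computes, emulated in scalar code for the UNROLLI == UNROLLJ == 4 case: nbat->simd_4xn_diag is assumed to hold j - i for i = 0, each gmx_sub_pr shifts to the next i row, and the 0 < (j - i) comparison keeps only the j > i pairs when a cluster is paired with itself:

    #include <stdio.h>

    /* Scalar emulation of the diagonal masks for UNROLLI == UNROLLJ == 4.
     * Row i keeps lane j iff j > i, masking out the self pair and the
     * double-counted lower triangle of a self-interacting cluster pair. */
    int main(void)
    {
        float diag_jmi[4] = { 0, 1, 2, 3 }; /* assumed simd_4xn_diag contents */
        int   i, j;

        for (i = 0; i < 4; i++)
        {
            printf("diag_S%d:", i);
            for (j = 0; j < 4; j++)
            {
                printf(" %d", 0.0f < diag_jmi[j]); /* gmx_cmplt_pr(zero_S, .) */
            }
            printf("\n");
            for (j = 0; j < 4; j++)
            {
                diag_jmi[j] -= 1.0f;               /* gmx_sub_pr(., one_S)    */
            }
        }
        return 0;
    }
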
+ /* Load masks for topology exclusion masking */
+#ifdef gmx_checkbitmask_epi32
+ mask_S0 = gmx_load_si(nbat->simd_excl_mask + 0*GMX_NBNXN_SIMD_BITWIDTH/32);
+ mask_S1 = gmx_load_si(nbat->simd_excl_mask + 1*GMX_NBNXN_SIMD_BITWIDTH/32);
+ mask_S2 = gmx_load_si(nbat->simd_excl_mask + 2*GMX_NBNXN_SIMD_BITWIDTH/32);
+ mask_S3 = gmx_load_si(nbat->simd_excl_mask + 3*GMX_NBNXN_SIMD_BITWIDTH/32);
+#else
+ mask_S0 = gmx_load_pr((real *)nbat->simd_excl_mask + 0*UNROLLJ);
+ mask_S1 = gmx_load_pr((real *)nbat->simd_excl_mask + 1*UNROLLJ);
+ mask_S2 = gmx_load_pr((real *)nbat->simd_excl_mask + 2*UNROLLJ);
+ mask_S3 = gmx_load_pr((real *)nbat->simd_excl_mask + 3*UNROLLJ);
+#endif
+
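
The mask_S* constants feed the per-pair topology-exclusion test; in scalar terms the interaction word carries one bit per (i,j) pair, and a cleared bit switches the pair off. A sketch of the equivalent scalar check, with the bit numbering assumed:

    #include <stdio.h>

    /* Scalar equivalent of the SIMD exclusion-mask test: the interaction
     * word 'excl' is assumed to hold bit i*UNROLLJ + j for pair (i,j);
     * each mask_S* vector above selects one i-row of those bits. */
    #define UNROLLI 4
    #define UNROLLJ 4

    static int pair_interacts(unsigned int excl, int i, int j)
    {
        return (excl >> (i*UNROLLJ + j)) & 1U;
    }

    int main(void)
    {
        unsigned int excl = 0xfff7; /* example: only pair (0,3) excluded */
        int          i, j;

        for (i = 0; i < UNROLLI; i++)
        {
            for (j = 0; j < UNROLLJ; j++)
            {
                printf("%d", pair_interacts(excl, i, j));
            }
            printf("\n");
        }
        return 0;
    }
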
#ifdef CALC_COUL_TAB
-#ifdef GMX_MM256_HERE
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
/* Generate aligned table index pointers */
- ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
- ti1 = (int *)(((size_t)(ti1_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
- ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
- ti3 = (int *)(((size_t)(ti3_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti0 = gmx_simd_align_int(ti0_array);
+ ti1 = gmx_simd_align_int(ti1_array);
+ ti2 = gmx_simd_align_int(ti2_array);
+ ti3 = gmx_simd_align_int(ti3_array);
#endif
- invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
+ invtsp_S = gmx_set1_pr(ic->tabq_scale);
#ifdef CALC_ENERGIES
- mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
+ mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
#endif
#ifdef TAB_FDV0
    tab_coul_F = ic->tabq_coul_FDV0;
#else
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif
#endif /* CALC_COUL_TAB */
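
For reference, a scalar sketch of the table lookup these constants serve: invtsp_S converts r to table units, the integer part indexes the table, and the fraction drives linear interpolation (mhalfsp_S, -0.5 times the table spacing, enters the matching energy correction). The interpolation form shown is illustrative, not lifted from the kernel inner loop:

    #include <stdio.h>

    /* Scalar sketch of tabulated-force lookup (illustrative form): scale r
     * by invtsp (table points per unit length), split into index and
     * fraction, interpolate between neighboring table points. */
    static float table_force(const float *tab_F, float invtsp, float r)
    {
        float rs   = r*invtsp;       /* r in table units     */
        int   ti   = (int)rs;        /* table index (r >= 0) */
        float frac = rs - (float)ti; /* position in interval */

        return tab_F[ti] + frac*(tab_F[ti+1] - tab_F[ti]);
    }

    int main(void)
    {
        const float tab_F[3] = { 1.0f, 0.5f, 0.25f }; /* toy force table */

        printf("F = %g\n", (double)table_force(tab_F, 2.0f, 0.6f));
        return 0;
    }
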
#ifdef CALC_COUL_EWALD
- beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
- beta_SSE = gmx_set1_pr(ic->ewaldcoeff);
+ beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+ beta_S = gmx_set1_pr(ic->ewaldcoeff);
#endif
#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
- sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
+ sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
#endif
q = nbat->q;
shiftvec = shift_vec[0];
x = nbat->x;
- avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+ avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
/* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
- rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+ rc2_S = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
- rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
+ rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
#endif
#ifdef CALC_ENERGIES
- sixthSSE = gmx_set1_pr(1.0/6.0);
- twelvethSSE = gmx_set1_pr(1.0/12.0);
+ sixth_S = gmx_set1_pr(1.0/6.0);
+ twelveth_S = gmx_set1_pr(1.0/12.0);
- sh_invrc6_SSE = gmx_set1_pr(ic->sh_invrc6);
- sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+ sh_invrc6_S = gmx_set1_pr(ic->sh_invrc6);
+ sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
#endif
- mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
+ mrc_3_S = gmx_set1_pr(-2*ic->k_rf);
#ifdef CALC_ENERGIES
- hrc_3_SSE = gmx_set1_pr(ic->k_rf);
+ hrc_3_S = gmx_set1_pr(ic->k_rf);
- moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
+ moh_rc_S = gmx_set1_pr(-ic->c_rf);
#endif
#ifdef CALC_ENERGIES
- tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
+ tmpsum = gmx_simd_align_real(tmpsum_array);
#endif
#ifdef CALC_SHIFTFORCES
- shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
+ shf = gmx_simd_align_real(shf_array);
#endif
#ifdef FIX_LJ_C
- pvdw_c6 = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
+ pvdw_c6 = gmx_simd_align_real(pvdw_array+3);
pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
    for (jp = 0; jp < UNROLLJ; jp++)
    {
        /* ... pvdw_c6 and remaining pvdw_c12 entries filled the same way (elided) ... */
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }
- c6_SSE0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
- c6_SSE1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
- c6_SSE2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
- c6_SSE3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
-
- c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
- c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
- c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
- c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+ c6_S0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+ c6_S1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+ c6_S2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+ c6_S3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+
+ c12_S0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+ c12_S1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+ c12_S2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+ c12_S3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
#ifdef ENERGY_GROUPS
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
- shX_SSE = gmx_load1_pr(shiftvec+ish3);
- shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
- shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
+ shX_S = gmx_load1_pr(shiftvec+ish3);
+ shY_S = gmx_load1_pr(shiftvec+ish3+1);
+ shZ_S = gmx_load1_pr(shiftvec+ish3+2);
#if UNROLLJ <= 4
sci = ci*STRIDE;
/* Load i atom data */
sciy = scix + STRIDE;
sciz = sciy + STRIDE;
- ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix), shX_SSE);
- ix_SSE1 = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_SSE);
- ix_SSE2 = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_SSE);
- ix_SSE3 = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_SSE);
- iy_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciy), shY_SSE);
- iy_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_SSE);
- iy_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_SSE);
- iy_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_SSE);
- iz_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_SSE);
- iz_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_SSE);
- iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_SSE);
- iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_SSE);
+ ix_S0 = gmx_add_pr(gmx_load1_pr(x+scix), shX_S);
+ ix_S1 = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S);
+ ix_S2 = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S);
+ ix_S3 = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S);
+ iy_S0 = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S);
+ iy_S1 = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S);
+ iy_S2 = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S);
+ iy_S3 = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S);
+ iz_S0 = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S);
+ iz_S1 = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S);
+ iz_S2 = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S);
+ iz_S3 = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S);
if (do_coul)
{
- iq_SSE0 = gmx_set1_pr(facel*q[sci]);
- iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
- iq_SSE2 = gmx_set1_pr(facel*q[sci+2]);
- iq_SSE3 = gmx_set1_pr(facel*q[sci+3]);
+ iq_S0 = gmx_set1_pr(facel*q[sci]);
+ iq_S1 = gmx_set1_pr(facel*q[sci+1]);
+ iq_S2 = gmx_set1_pr(facel*q[sci+2]);
+ iq_S3 = gmx_set1_pr(facel*q[sci+3]);
}
#ifdef LJ_COMB_LB
- hsig_i_SSE0 = gmx_load1_pr(ljc+sci2+0);
- hsig_i_SSE1 = gmx_load1_pr(ljc+sci2+1);
- hsig_i_SSE2 = gmx_load1_pr(ljc+sci2+2);
- hsig_i_SSE3 = gmx_load1_pr(ljc+sci2+3);
- seps_i_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
- seps_i_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
- seps_i_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
- seps_i_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
+ hsig_i_S0 = gmx_load1_pr(ljc+sci2+0);
+ hsig_i_S1 = gmx_load1_pr(ljc+sci2+1);
+ hsig_i_S2 = gmx_load1_pr(ljc+sci2+2);
+ hsig_i_S3 = gmx_load1_pr(ljc+sci2+3);
+ seps_i_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
+ seps_i_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
+ seps_i_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
+ seps_i_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
#else
#ifdef LJ_COMB_GEOM
- c6s_SSE0 = gmx_load1_pr(ljc+sci2+0);
- c6s_SSE1 = gmx_load1_pr(ljc+sci2+1);
+ c6s_S0 = gmx_load1_pr(ljc+sci2+0);
+ c6s_S1 = gmx_load1_pr(ljc+sci2+1);
if (!half_LJ)
{
- c6s_SSE2 = gmx_load1_pr(ljc+sci2+2);
- c6s_SSE3 = gmx_load1_pr(ljc+sci2+3);
+ c6s_S2 = gmx_load1_pr(ljc+sci2+2);
+ c6s_S3 = gmx_load1_pr(ljc+sci2+3);
}
- c12s_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
- c12s_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
+ c12s_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
+ c12s_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
if (!half_LJ)
{
- c12s_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
- c12s_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
+ c12s_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
+ c12s_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
}
#else
nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
#endif
/* Zero the potential energy for this list */
- VvdwtotSSE = gmx_setzero_pr();
- vctotSSE = gmx_setzero_pr();
+ Vvdwtot_S = gmx_setzero_pr();
+ vctot_S = gmx_setzero_pr();
/* Clear i atom forces */
- fix_SSE0 = gmx_setzero_pr();
- fix_SSE1 = gmx_setzero_pr();
- fix_SSE2 = gmx_setzero_pr();
- fix_SSE3 = gmx_setzero_pr();
- fiy_SSE0 = gmx_setzero_pr();
- fiy_SSE1 = gmx_setzero_pr();
- fiy_SSE2 = gmx_setzero_pr();
- fiy_SSE3 = gmx_setzero_pr();
- fiz_SSE0 = gmx_setzero_pr();
- fiz_SSE1 = gmx_setzero_pr();
- fiz_SSE2 = gmx_setzero_pr();
- fiz_SSE3 = gmx_setzero_pr();
+ fix_S0 = gmx_setzero_pr();
+ fix_S1 = gmx_setzero_pr();
+ fix_S2 = gmx_setzero_pr();
+ fix_S3 = gmx_setzero_pr();
+ fiy_S0 = gmx_setzero_pr();
+ fiy_S1 = gmx_setzero_pr();
+ fiy_S2 = gmx_setzero_pr();
+ fiy_S3 = gmx_setzero_pr();
+ fiz_S0 = gmx_setzero_pr();
+ fiz_S1 = gmx_setzero_pr();
+ fiz_S2 = gmx_setzero_pr();
+ fiz_S3 = gmx_setzero_pr();
cjind = cjind0;
/* Add accumulated i-forces to the force array */
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
-#define gmx_load_ps4 _mm_load_ps
-#define gmx_store_ps4 _mm_store_ps
-#define gmx_add_ps4 _mm_add_ps
+#define gmx_load_pr4 _mm_load_ps
+#define gmx_store_pr4 _mm_store_ps
+#define gmx_add_pr4 _mm_add_ps
#else
-#define gmx_load_ps4 _mm256_load_pd
-#define gmx_store_ps4 _mm256_store_pd
-#define gmx_add_ps4 _mm256_add_pd
+#define gmx_load_pr4 _mm256_load_pd
+#define gmx_store_pr4 _mm256_store_pd
+#define gmx_add_pr4 _mm256_add_pd
#endif
- GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3, fix_SSE);
- gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
+ GMX_MM_TRANSPOSE_SUM4_PR(fix_S0, fix_S1, fix_S2, fix_S3, fix_S);
+ gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
- GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3, fiy_SSE);
- gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
+ GMX_MM_TRANSPOSE_SUM4_PR(fiy_S0, fiy_S1, fiy_S2, fiy_S3, fiy_S);
+ gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
- GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3, fiz_SSE);
- gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
+ GMX_MM_TRANSPOSE_SUM4_PR(fiz_S0, fiz_S1, fiz_S2, fiz_S3, fiz_S);
+ gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
#ifdef CALC_SHIFTFORCES
- gmx_store_ps4(shf, fix_SSE);
+ gmx_store_pr4(shf, fix_S);
fshift[ish3+0] += SUM_SIMD4(shf);
- gmx_store_ps4(shf, fiy_SSE);
+ gmx_store_pr4(shf, fiy_S);
fshift[ish3+1] += SUM_SIMD4(shf);
- gmx_store_ps4(shf, fiz_SSE);
+ gmx_store_pr4(shf, fiz_S);
fshift[ish3+2] += SUM_SIMD4(shf);
#endif
#else
- GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0, fix_SSE1, fix0_SSE);
- _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
- GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2, fix_SSE3, fix2_SSE);
- _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fix_S0, fix_S1, fix0_S);
+ _mm_store_pd(f+scix, _mm_add_pd(fix0_S, _mm_load_pd(f+scix)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fix_S2, fix_S3, fix2_S);
+ _mm_store_pd(f+scix+2, _mm_add_pd(fix2_S, _mm_load_pd(f+scix+2)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0, fiy_SSE1, fiy0_SSE);
- _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2, fiy_SSE3, fiy2_SSE);
- _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiy_S0, fiy_S1, fiy0_S);
+ _mm_store_pd(f+sciy, _mm_add_pd(fiy0_S, _mm_load_pd(f+sciy)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiy_S2, fiy_S3, fiy2_S);
+ _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_S, _mm_load_pd(f+sciy+2)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0, fiz_SSE1, fiz0_SSE);
- _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2, fiz_SSE3, fiz2_SSE);
- _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiz_S0, fiz_S1, fiz0_S);
+ _mm_store_pd(f+sciz, _mm_add_pd(fiz0_S, _mm_load_pd(f+sciz)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiz_S2, fiz_S3, fiz2_S);
+ _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_S, _mm_load_pd(f+sciz+2)));
#ifdef CALC_SHIFTFORCES
- _mm_store_pd(shf, _mm_add_pd(fix0_SSE, fix2_SSE));
+ _mm_store_pd(shf, _mm_add_pd(fix0_S, fix2_S));
fshift[ish3+0] += shf[0] + shf[1];
- _mm_store_pd(shf, _mm_add_pd(fiy0_SSE, fiy2_SSE));
+ _mm_store_pd(shf, _mm_add_pd(fiy0_S, fiy2_S));
fshift[ish3+1] += shf[0] + shf[1];
- _mm_store_pd(shf, _mm_add_pd(fiz0_SSE, fiz2_SSE));
+ _mm_store_pd(shf, _mm_add_pd(fiz0_S, fiz2_S));
fshift[ish3+2] += shf[0] + shf[1];
#endif
#endif
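
What the transpose-sum reduction above achieves, in scalar form: each of the four i-atom accumulators holds per-j-lane partial forces; GMX_MM_TRANSPOSE_SUM4_PR collapses them into one 4-wide vector whose lane k is the total force on i-atom k, ready for a single aligned read-modify-write of f[scix..scix+3]. A sketch:

    #include <stdio.h>

    /* Scalar sketch of GMX_MM_TRANSPOSE_SUM4_PR for 4-wide registers:
     * out[k] = sum over j-lanes of the partial forces on i-atom k. */
    static void transpose_sum4(const float fi[4][4], float out[4])
    {
        int k;

        for (k = 0; k < 4; k++)
        {
            out[k] = fi[k][0] + fi[k][1] + fi[k][2] + fi[k][3];
        }
    }

    int main(void)
    {
        float fix[4][4] = { {  1, 2,  3, 4 },
                            {  0, 0,  1, 0 },
                            { -1, 1, -1, 1 },
                            {  2, 2,  2, 2 } };
        float fix_sum[4];
        int   k;

        transpose_sum4(fix, fix_sum);
        for (k = 0; k < 4; k++)
        {
            printf("i-atom %d: fx = %g\n", k, (double)fix_sum[k]);
        }
        return 0;
    }
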
#ifdef CALC_ENERGIES
if (do_coul)
{
- gmx_store_pr(tmpsum, vctotSSE);
+ gmx_store_pr(tmpsum, vctot_S);
*Vc += SUM_SIMD(tmpsum);
}
- gmx_store_pr(tmpsum, VvdwtotSSE);
+ gmx_store_pr(tmpsum, Vvdwtot_S);
*Vvdw += SUM_SIMD(tmpsum);
#endif
#endif
}
-#undef gmx_load_ps4
-#undef gmx_store_ps4
-#undef gmx_store_ps4
+
+#undef gmx_load_pr4
+#undef gmx_store_pr4
+#undef gmx_add_pr4
#undef CALC_SHIFTFORCES
#undef STRIDE
#undef TAB_FDV0
#undef NBFP_STRIDE
+
+#undef GMX_USE_HALF_WIDTH_SIMD_HERE