* the research papers on the package. Check out http://www.gromacs.org.
*/
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
+#if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
+#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
+#endif
+
+#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
+#define GMX_USE_HALF_WIDTH_SIMD_HERE
+#endif
#include "gmx_simd_macros.h"
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ GMX_SIMD_WIDTH_HERE
-#if defined GMX_MM128_HERE || defined GMX_DOUBLE
-#define STRIDE 4
-#endif
-#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
-#define STRIDE 8
+/* The stride of all the atom data arrays is max(UNROLLI,UNROLLJ) */
+#if GMX_SIMD_WIDTH_HERE >= UNROLLI
+#define STRIDE GMX_SIMD_WIDTH_HERE
+#else
+#define STRIDE UNROLLI
#endif
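
A minimal standalone sketch (not part of the patch): the new STRIDE rule, max(UNROLLI, UNROLLJ), can be checked against the hard-coded strides it replaces, assuming UNROLLI == 4 as for the x86 i-cluster size:

    #include <stdio.h>

    /* Sketch: evaluate STRIDE = max(UNROLLI, UNROLLJ) for the four
     * width/precision combinations this file supports (UNROLLI assumed 4).
     * The result reproduces the removed hard-coded strides: 4, 4, 8, 4. */
    int main(void)
    {
        const int   unrolli  = 4;
        const int   widths[] = { 4, 2, 8, 4 };
        const char *cfg[]    = { "128-bit single", "128-bit double",
                                 "256-bit single", "256-bit double" };
        int         i;

        for (i = 0; i < 4; i++)
        {
            int unrollj = widths[i];
            int stride  = (unrollj >= unrolli) ? unrollj : unrolli;

            printf("%-14s  UNROLLJ=%d  STRIDE=%d\n", cfg[i], unrollj, stride);
        }
        return 0;
    }
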
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
-/* single precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
-#define TAB_FDV0
+#if GMX_SIMD_WIDTH_HERE == 2
+#define SUM_SIMD(x) (x[0]+x[1])
+#else
+#if GMX_SIMD_WIDTH_HERE == 4
+#define SUM_SIMD(x) SUM_SIMD4(x)
#else
-/* double precision 4x2 kernel */
-#define SUM_SIMD(x) (x[0]+x[1])
+#if GMX_SIMD_WIDTH_HERE == 8
+#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+#else
+#error "unsupported kernel configuration"
+#endif
#endif
#endif
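
The SUM_SIMD macros are plain horizontal reductions over an aligned scratch array holding the lanes of one SIMD register; a minimal usage sketch for the 4-wide case (the 2- and 8-wide variants differ only in lane count):

    #include <stdio.h>

    /* Sketch of the SUM_SIMD reduction pattern: a SIMD accumulator is
     * stored to aligned scratch and its lanes are summed in scalar code. */
    #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
    #define SUM_SIMD(x)  SUM_SIMD4(x)

    int main(void)
    {
        float tmpsum[4] = { 0.5f, -1.0f, 2.0f, 0.25f }; /* per-lane partials */

        printf("reduced energy: %g\n", (double)SUM_SIMD(tmpsum));
        return 0;
    }
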
-#ifdef GMX_MM256_HERE
-#ifndef GMX_DOUBLE
-/* single precision 4x8 kernel */
-#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
+
+/* Decide if we should use the FDV0 table layout */
+#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
+/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
+#if GMX_SIMD_WIDTH_HERE/2 == 4
#define TAB_FDV0
+#endif
#else
-/* double precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
+/* We use the FDV0 table layout when we can use aligned table loads */
+#if GMX_SIMD_WIDTH_HERE == 4
+#define TAB_FDV0
#endif
#endif
+
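
Why the FDV0 layout earns its aligned loads, as far as the surrounding code shows (a single tab_coul_F versus separate tab_coul_F/tab_coul_V tables): per table point, force, force difference, energy, and a zero pad are packed into one 4-real quadruplet, so a single aligned 4-wide load fetches everything for that point. A sketch; the packing order is an assumption based on the name F-D-V-0:

    #include <stdio.h>
    #include <stdlib.h>

    /* Sketch (field order assumed): build one packed FDV0 table from
     * separate force and energy tables, so each point needs a single
     * aligned 4-real load instead of loads from two tables. */
    static float *make_fdv0(const float *F, const float *V, int n)
    {
        float *t = malloc(4*(size_t)n*sizeof *t);
        int    i;

        for (i = 0; i < n; i++)
        {
            t[4*i+0] = F[i];                             /* F: force         */
            t[4*i+1] = (i+1 < n ? F[i+1] : F[i]) - F[i]; /* D: F[i+1] - F[i] */
            t[4*i+2] = V[i];                             /* V: energy        */
            t[4*i+3] = 0.0f;                             /* 0: padding       */
        }
        return t;
    }

    int main(void)
    {
        const float F[3] = { 1.0f, 0.5f, 0.25f }, V[3] = { 2.0f, 1.0f, 0.5f };
        float      *t    = make_fdv0(F, V, 3);

        printf("point 0: F=%g D=%g V=%g pad=%g\n",
               (double)t[0], (double)t[1], (double)t[2], (double)t[3]);
        free(t);
        return 0;
    }
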
#define SIMD_MASK_ALL 0xffffffff
#include "nbnxn_kernel_simd_utils.h"
real *vctp[UNROLLI];
#endif
- gmx_mm_pr shX_SSE;
- gmx_mm_pr shY_SSE;
- gmx_mm_pr shZ_SSE;
- gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
- gmx_mm_pr ix_SSE1, iy_SSE1, iz_SSE1;
- gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
- gmx_mm_pr ix_SSE3, iy_SSE3, iz_SSE3;
- gmx_mm_pr fix_SSE0, fiy_SSE0, fiz_SSE0;
- gmx_mm_pr fix_SSE1, fiy_SSE1, fiz_SSE1;
- gmx_mm_pr fix_SSE2, fiy_SSE2, fiz_SSE2;
- gmx_mm_pr fix_SSE3, fiy_SSE3, fiz_SSE3;
+ gmx_mm_pr shX_S;
+ gmx_mm_pr shY_S;
+ gmx_mm_pr shZ_S;
+ gmx_mm_pr ix_S0, iy_S0, iz_S0;
+ gmx_mm_pr ix_S1, iy_S1, iz_S1;
+ gmx_mm_pr ix_S2, iy_S2, iz_S2;
+ gmx_mm_pr ix_S3, iy_S3, iz_S3;
+ gmx_mm_pr fix_S0, fiy_S0, fiz_S0;
+ gmx_mm_pr fix_S1, fiy_S1, fiz_S1;
+ gmx_mm_pr fix_S2, fiy_S2, fiz_S2;
+ gmx_mm_pr fix_S3, fiy_S3, fiz_S3;
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
- __m128 fix_SSE, fiy_SSE, fiz_SSE;
+ __m128 fix_S, fiy_S, fiz_S;
#else
- __m256d fix_SSE, fiy_SSE, fiz_SSE;
+ __m256d fix_S, fiy_S, fiz_S;
#endif
#else
- __m128d fix0_SSE, fiy0_SSE, fiz0_SSE;
- __m128d fix2_SSE, fiy2_SSE, fiz2_SSE;
+ __m128d fix0_S, fiy0_S, fiz0_S;
+ __m128d fix2_S, fiy2_S, fiz2_S;
#endif
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
- __m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
- __m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
- __m128i mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
- __m128i mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );
-#else
- /* For double precision we need to set two 32bit ints for one double */
- __m128i mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
- __m128i mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
- __m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
- __m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
-#endif
-#endif
-#ifdef GMX_MM256_HERE
- /* AVX: use floating point masks, as there are no integer instructions */
-#ifndef GMX_DOUBLE
- gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
- gmx_mm_pr mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
+ gmx_mm_pr diag_jmi_S;
+#if UNROLLI == UNROLLJ
+ gmx_mm_pr diag_S0, diag_S1, diag_S2, diag_S3;
#else
- /* There is no 256-bit int to double conversion, so we use float here */
- __m256 mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
- __m256 mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
- __m256 mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
- __m256 mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
-#endif
+ gmx_mm_pr diag0_S0, diag0_S1, diag0_S2, diag0_S3;
+ gmx_mm_pr diag1_S0, diag1_S1, diag1_S2, diag1_S3;
#endif
- gmx_mm_pr diag_jmi_SSE;
-#if UNROLLI == UNROLLJ
- gmx_mm_pr diag_SSE0, diag_SSE1, diag_SSE2, diag_SSE3;
+#ifdef gmx_checkbitmask_epi32
+ gmx_epi32 mask_S0, mask_S1, mask_S2, mask_S3;
#else
- gmx_mm_pr diag0_SSE0, diag0_SSE1, diag0_SSE2, diag0_SSE3;
- gmx_mm_pr diag1_SSE0, diag1_SSE1, diag1_SSE2, diag1_SSE3;
+ gmx_mm_pr mask_S0, mask_S1, mask_S2, mask_S3;
#endif
-#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
- __m128i zeroi_SSE = _mm_setzero_si128();
-#endif
- gmx_mm_pr zero_SSE = gmx_set1_pr(0);
+ gmx_mm_pr zero_S = gmx_set1_pr(0);
- gmx_mm_pr one_SSE = gmx_set1_pr(1.0);
- gmx_mm_pr iq_SSE0 = gmx_setzero_pr();
- gmx_mm_pr iq_SSE1 = gmx_setzero_pr();
- gmx_mm_pr iq_SSE2 = gmx_setzero_pr();
- gmx_mm_pr iq_SSE3 = gmx_setzero_pr();
- gmx_mm_pr mrc_3_SSE;
+ gmx_mm_pr one_S = gmx_set1_pr(1.0);
+ gmx_mm_pr iq_S0 = gmx_setzero_pr();
+ gmx_mm_pr iq_S1 = gmx_setzero_pr();
+ gmx_mm_pr iq_S2 = gmx_setzero_pr();
+ gmx_mm_pr iq_S3 = gmx_setzero_pr();
+ gmx_mm_pr mrc_3_S;
#ifdef CALC_ENERGIES
- gmx_mm_pr hrc_3_SSE, moh_rc_SSE;
+ gmx_mm_pr hrc_3_S, moh_rc_S;
#endif
#ifdef CALC_COUL_TAB
/* Coulomb table variables */
- gmx_mm_pr invtsp_SSE;
+ gmx_mm_pr invtsp_S;
const real *tab_coul_F;
#ifndef TAB_FDV0
const real *tab_coul_V;
#endif
-#ifdef GMX_MM256_HERE
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
int ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
int ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1;
int ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
int ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3;
#endif
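
The ti*_array over-allocation by GMX_SIMD_WIDTH_HERE-1 ints plus gmx_simd_align_int presumably performs the same pointer round-up that the open-coded casts removed below did; a standalone sketch of that idiom, which also underlies the gmx_simd_align_real calls further down:

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the alignment idiom assumed behind gmx_simd_align_int:
     * over-allocate by WIDTH-1 elements, then round the (int-aligned)
     * pointer up to a WIDTH*sizeof(int) boundary. WIDTH must be a power
     * of two; 8 mimics 256-bit single precision. */
    #define WIDTH 8

    static int *align_int(int *p)
    {
        uintptr_t a = (uintptr_t)(p + WIDTH - 1);

        return (int *)(a & ~(uintptr_t)(WIDTH*sizeof(int) - 1));
    }

    int main(void)
    {
        int  ti_array[2*WIDTH-1]; /* worst-case slack, as in ti0_array above */
        int *ti = align_int(ti_array);

        printf("%p aligned to %zu bytes: %s\n", (void *)ti, WIDTH*sizeof(int),
               ((uintptr_t)ti % (WIDTH*sizeof(int)) == 0) ? "yes" : "no");
        return 0;
    }
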
#ifdef CALC_ENERGIES
- gmx_mm_pr mhalfsp_SSE;
+ gmx_mm_pr mhalfsp_S;
#endif
#endif
#ifdef CALC_COUL_EWALD
- gmx_mm_pr beta2_SSE, beta_SSE;
+ gmx_mm_pr beta2_S, beta_S;
#endif
#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
- gmx_mm_pr sh_ewald_SSE;
+ gmx_mm_pr sh_ewald_S;
#endif
#ifdef LJ_COMB_LB
const real *ljc;
- gmx_mm_pr hsig_i_SSE0, seps_i_SSE0;
- gmx_mm_pr hsig_i_SSE1, seps_i_SSE1;
- gmx_mm_pr hsig_i_SSE2, seps_i_SSE2;
- gmx_mm_pr hsig_i_SSE3, seps_i_SSE3;
+ gmx_mm_pr hsig_i_S0, seps_i_S0;
+ gmx_mm_pr hsig_i_S1, seps_i_S1;
+ gmx_mm_pr hsig_i_S2, seps_i_S2;
+ gmx_mm_pr hsig_i_S3, seps_i_S3;
#else
#ifdef FIX_LJ_C
real pvdw_array[2*UNROLLI*UNROLLJ+3];
real *pvdw_c6, *pvdw_c12;
- gmx_mm_pr c6_SSE0, c12_SSE0;
- gmx_mm_pr c6_SSE1, c12_SSE1;
- gmx_mm_pr c6_SSE2, c12_SSE2;
- gmx_mm_pr c6_SSE3, c12_SSE3;
+ gmx_mm_pr c6_S0, c12_S0;
+ gmx_mm_pr c6_S1, c12_S1;
+ gmx_mm_pr c6_S2, c12_S2;
+ gmx_mm_pr c6_S3, c12_S3;
#endif
#ifdef LJ_COMB_GEOM
const real *ljc;
- gmx_mm_pr c6s_SSE0, c12s_SSE0;
- gmx_mm_pr c6s_SSE1, c12s_SSE1;
- gmx_mm_pr c6s_SSE2 = gmx_setzero_pr(), c12s_SSE2 = gmx_setzero_pr();
- gmx_mm_pr c6s_SSE3 = gmx_setzero_pr(), c12s_SSE3 = gmx_setzero_pr();
+ gmx_mm_pr c6s_S0, c12s_S0;
+ gmx_mm_pr c6s_S1, c12s_S1;
+ gmx_mm_pr c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
+ gmx_mm_pr c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
#endif
#endif /* LJ_COMB_LB */
- gmx_mm_pr vctotSSE, VvdwtotSSE;
- gmx_mm_pr sixthSSE, twelvethSSE;
+ gmx_mm_pr vctot_S, Vvdwtot_S;
+ gmx_mm_pr sixth_S, twelveth_S;
- gmx_mm_pr avoid_sing_SSE;
- gmx_mm_pr rc2_SSE;
+ gmx_mm_pr avoid_sing_S;
+ gmx_mm_pr rc2_S;
#ifdef VDW_CUTOFF_CHECK
- gmx_mm_pr rcvdw2_SSE;
+ gmx_mm_pr rcvdw2_S;
#endif
#ifdef CALC_ENERGIES
- gmx_mm_pr sh_invrc6_SSE, sh_invrc12_SSE;
+ gmx_mm_pr sh_invrc6_S, sh_invrc12_S;
/* cppcheck-suppress unassignedVariable */
real tmpsum_array[15], *tmpsum;
#endif
/* Load j-i for the first i */
- diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag);
+ diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag);
/* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
- diag_SSE0 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag_SSE1 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag_SSE2 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag_SSE3 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
+ diag_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
#else
#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
- diag0_SSE0 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag0_SSE1 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag0_SSE2 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag0_SSE3 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
+ diag0_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag0_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag0_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag0_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
#if UNROLLI == 2*UNROLLJ
/* Load j-i for the second half of the j-cluster */
- diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
+ diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
#endif
- diag1_SSE0 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag1_SSE1 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag1_SSE2 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
- diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
- diag1_SSE3 = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
+ diag1_S0 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag1_S1 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag1_S2 = gmx_cmplt_pr(zero_S, diag_jmi_S);
+ diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
+ diag1_S3 = gmx_cmplt_pr(zero_S, diag_jmi_S);
#endif
#endif
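
What the diagonal-mask construction computes, emulated in scalar code for the UNROLLI == UNROLLJ == 4 case: nbat->simd_4xn_diag is assumed to hold j - i for i = 0, each gmx_sub_pr shifts to the next i row, and the 0 < (j - i) comparison keeps only the j > i pairs when a cluster is paired with itself:

    #include <stdio.h>

    /* Scalar emulation of the diagonal masks for UNROLLI == UNROLLJ == 4.
     * Row i keeps lane j iff j > i, masking out the self pair and the
     * double-counted lower triangle of a self-interacting cluster pair. */
    int main(void)
    {
        float diag_jmi[4] = { 0, 1, 2, 3 }; /* assumed simd_4xn_diag contents */
        int   i, j;

        for (i = 0; i < 4; i++)
        {
            printf("diag_S%d:", i);
            for (j = 0; j < 4; j++)
            {
                printf(" %d", 0.0f < diag_jmi[j]); /* gmx_cmplt_pr(zero_S, .) */
            }
            printf("\n");
            for (j = 0; j < 4; j++)
            {
                diag_jmi[j] -= 1.0f;               /* gmx_sub_pr(., one_S)    */
            }
        }
        return 0;
    }
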
+ /* Load masks for topology exclusion masking */
+#ifdef gmx_checkbitmask_epi32
+ mask_S0 = gmx_load_si(nbat->simd_excl_mask + 0*GMX_NBNXN_SIMD_BITWIDTH/32);
+ mask_S1 = gmx_load_si(nbat->simd_excl_mask + 1*GMX_NBNXN_SIMD_BITWIDTH/32);
+ mask_S2 = gmx_load_si(nbat->simd_excl_mask + 2*GMX_NBNXN_SIMD_BITWIDTH/32);
+ mask_S3 = gmx_load_si(nbat->simd_excl_mask + 3*GMX_NBNXN_SIMD_BITWIDTH/32);
+#else
+ mask_S0 = gmx_load_pr((real *)nbat->simd_excl_mask + 0*UNROLLJ);
+ mask_S1 = gmx_load_pr((real *)nbat->simd_excl_mask + 1*UNROLLJ);
+ mask_S2 = gmx_load_pr((real *)nbat->simd_excl_mask + 2*UNROLLJ);
+ mask_S3 = gmx_load_pr((real *)nbat->simd_excl_mask + 3*UNROLLJ);
+#endif
+
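
The mask_S* constants feed the per-pair topology-exclusion test; in scalar terms the interaction word carries one bit per (i,j) pair, and a cleared bit switches the pair off. A sketch of the equivalent scalar check, with the bit numbering assumed:

    #include <stdio.h>

    /* Scalar equivalent of the SIMD exclusion-mask test: the interaction
     * word 'excl' is assumed to hold bit i*UNROLLJ + j for pair (i,j);
     * each mask_S* vector above selects one i-row of those bits. */
    #define UNROLLI 4
    #define UNROLLJ 4

    static int pair_interacts(unsigned int excl, int i, int j)
    {
        return (excl >> (i*UNROLLJ + j)) & 1U;
    }

    int main(void)
    {
        unsigned int excl = 0xfff7; /* example: only pair (0,3) excluded */
        int          i, j;

        for (i = 0; i < UNROLLI; i++)
        {
            for (j = 0; j < UNROLLJ; j++)
            {
                printf("%d", pair_interacts(excl, i, j));
            }
            printf("\n");
        }
        return 0;
    }
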
#ifdef CALC_COUL_TAB
-#ifdef GMX_MM256_HERE
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
/* Generate aligned table index pointers */
- ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
- ti1 = (int *)(((size_t)(ti1_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
- ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
- ti3 = (int *)(((size_t)(ti3_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti0 = gmx_simd_align_int(ti0_array);
+ ti1 = gmx_simd_align_int(ti1_array);
+ ti2 = gmx_simd_align_int(ti2_array);
+ ti3 = gmx_simd_align_int(ti3_array);
#endif
- invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
+ invtsp_S = gmx_set1_pr(ic->tabq_scale);
#ifdef CALC_ENERGIES
- mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
+ mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
#endif
#ifdef TAB_FDV0
    tab_coul_F = ic->tabq_coul_FDV0;
#else
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif
#endif /* CALC_COUL_TAB */
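
For reference, a scalar sketch of the table lookup these constants serve: invtsp_S converts r to table units, the integer part indexes the table, and the fraction drives linear interpolation (mhalfsp_S, -0.5 times the table spacing, enters the matching energy correction). The interpolation form shown is illustrative, not lifted from the kernel inner loop:

    #include <stdio.h>

    /* Scalar sketch of tabulated-force lookup (illustrative form): scale r
     * by invtsp (table points per unit length), split into index and
     * fraction, interpolate between neighboring table points. */
    static float table_force(const float *tab_F, float invtsp, float r)
    {
        float rs   = r*invtsp;       /* r in table units     */
        int   ti   = (int)rs;        /* table index (r >= 0) */
        float frac = rs - (float)ti; /* position in interval */

        return tab_F[ti] + frac*(tab_F[ti+1] - tab_F[ti]);
    }

    int main(void)
    {
        const float tab_F[3] = { 1.0f, 0.5f, 0.25f }; /* toy force table */

        printf("F = %g\n", (double)table_force(tab_F, 2.0f, 0.6f));
        return 0;
    }
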
#ifdef CALC_COUL_EWALD
- beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
- beta_SSE = gmx_set1_pr(ic->ewaldcoeff);
+ beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+ beta_S = gmx_set1_pr(ic->ewaldcoeff);
#endif
#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
- sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
+ sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
#endif
q = nbat->q;
shiftvec = shift_vec[0];
x = nbat->x;
- avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
+ avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
/* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
- rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
+ rc2_S = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
- rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
+ rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
#endif
#ifdef CALC_ENERGIES
- sixthSSE = gmx_set1_pr(1.0/6.0);
- twelvethSSE = gmx_set1_pr(1.0/12.0);
+ sixth_S = gmx_set1_pr(1.0/6.0);
+ twelveth_S = gmx_set1_pr(1.0/12.0);
- sh_invrc6_SSE = gmx_set1_pr(ic->sh_invrc6);
- sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
+ sh_invrc6_S = gmx_set1_pr(ic->sh_invrc6);
+ sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
#endif
- mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
+ mrc_3_S = gmx_set1_pr(-2*ic->k_rf);
#ifdef CALC_ENERGIES
- hrc_3_SSE = gmx_set1_pr(ic->k_rf);
+ hrc_3_S = gmx_set1_pr(ic->k_rf);
- moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
+ moh_rc_S = gmx_set1_pr(-ic->c_rf);
#endif
#ifdef CALC_ENERGIES
- tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
+ tmpsum = gmx_simd_align_real(tmpsum_array);
#endif
#ifdef CALC_SHIFTFORCES
- shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
+ shf = gmx_simd_align_real(shf_array);
#endif
#ifdef FIX_LJ_C
- pvdw_c6 = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
+ pvdw_c6 = gmx_simd_align_real(pvdw_array+3);
pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
    for (jp = 0; jp < UNROLLJ; jp++)
    {
        /* ... pvdw_c6 and remaining pvdw_c12 entries filled the same way (elided) ... */
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }
- c6_SSE0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
- c6_SSE1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
- c6_SSE2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
- c6_SSE3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
-
- c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
- c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
- c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
- c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
+ c6_S0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
+ c6_S1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
+ c6_S2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
+ c6_S3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
+
+ c12_S0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
+ c12_S1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
+ c12_S2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
+ c12_S3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
#ifdef ENERGY_GROUPS
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
- shX_SSE = gmx_load1_pr(shiftvec+ish3);
- shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
- shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
+ shX_S = gmx_load1_pr(shiftvec+ish3);
+ shY_S = gmx_load1_pr(shiftvec+ish3+1);
+ shZ_S = gmx_load1_pr(shiftvec+ish3+2);
#if UNROLLJ <= 4
sci = ci*STRIDE;
/* Load i atom data */
sciy = scix + STRIDE;
sciz = sciy + STRIDE;
- ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix), shX_SSE);
- ix_SSE1 = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_SSE);
- ix_SSE2 = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_SSE);
- ix_SSE3 = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_SSE);
- iy_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciy), shY_SSE);
- iy_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_SSE);
- iy_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_SSE);
- iy_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_SSE);
- iz_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_SSE);
- iz_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_SSE);
- iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_SSE);
- iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_SSE);
+ ix_S0 = gmx_add_pr(gmx_load1_pr(x+scix), shX_S);
+ ix_S1 = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S);
+ ix_S2 = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S);
+ ix_S3 = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S);
+ iy_S0 = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S);
+ iy_S1 = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S);
+ iy_S2 = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S);
+ iy_S3 = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S);
+ iz_S0 = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S);
+ iz_S1 = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S);
+ iz_S2 = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S);
+ iz_S3 = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S);
if (do_coul)
{
- iq_SSE0 = gmx_set1_pr(facel*q[sci]);
- iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
- iq_SSE2 = gmx_set1_pr(facel*q[sci+2]);
- iq_SSE3 = gmx_set1_pr(facel*q[sci+3]);
+ iq_S0 = gmx_set1_pr(facel*q[sci]);
+ iq_S1 = gmx_set1_pr(facel*q[sci+1]);
+ iq_S2 = gmx_set1_pr(facel*q[sci+2]);
+ iq_S3 = gmx_set1_pr(facel*q[sci+3]);
}
#ifdef LJ_COMB_LB
- hsig_i_SSE0 = gmx_load1_pr(ljc+sci2+0);
- hsig_i_SSE1 = gmx_load1_pr(ljc+sci2+1);
- hsig_i_SSE2 = gmx_load1_pr(ljc+sci2+2);
- hsig_i_SSE3 = gmx_load1_pr(ljc+sci2+3);
- seps_i_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
- seps_i_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
- seps_i_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
- seps_i_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
+ hsig_i_S0 = gmx_load1_pr(ljc+sci2+0);
+ hsig_i_S1 = gmx_load1_pr(ljc+sci2+1);
+ hsig_i_S2 = gmx_load1_pr(ljc+sci2+2);
+ hsig_i_S3 = gmx_load1_pr(ljc+sci2+3);
+ seps_i_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
+ seps_i_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
+ seps_i_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
+ seps_i_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
#else
#ifdef LJ_COMB_GEOM
- c6s_SSE0 = gmx_load1_pr(ljc+sci2+0);
- c6s_SSE1 = gmx_load1_pr(ljc+sci2+1);
+ c6s_S0 = gmx_load1_pr(ljc+sci2+0);
+ c6s_S1 = gmx_load1_pr(ljc+sci2+1);
if (!half_LJ)
{
- c6s_SSE2 = gmx_load1_pr(ljc+sci2+2);
- c6s_SSE3 = gmx_load1_pr(ljc+sci2+3);
+ c6s_S2 = gmx_load1_pr(ljc+sci2+2);
+ c6s_S3 = gmx_load1_pr(ljc+sci2+3);
}
- c12s_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
- c12s_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
+ c12s_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
+ c12s_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
if (!half_LJ)
{
- c12s_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
- c12s_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
+ c12s_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
+ c12s_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
}
#else
nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
#endif
/* Zero the potential energy for this list */
- VvdwtotSSE = gmx_setzero_pr();
- vctotSSE = gmx_setzero_pr();
+ Vvdwtot_S = gmx_setzero_pr();
+ vctot_S = gmx_setzero_pr();
/* Clear i atom forces */
- fix_SSE0 = gmx_setzero_pr();
- fix_SSE1 = gmx_setzero_pr();
- fix_SSE2 = gmx_setzero_pr();
- fix_SSE3 = gmx_setzero_pr();
- fiy_SSE0 = gmx_setzero_pr();
- fiy_SSE1 = gmx_setzero_pr();
- fiy_SSE2 = gmx_setzero_pr();
- fiy_SSE3 = gmx_setzero_pr();
- fiz_SSE0 = gmx_setzero_pr();
- fiz_SSE1 = gmx_setzero_pr();
- fiz_SSE2 = gmx_setzero_pr();
- fiz_SSE3 = gmx_setzero_pr();
+ fix_S0 = gmx_setzero_pr();
+ fix_S1 = gmx_setzero_pr();
+ fix_S2 = gmx_setzero_pr();
+ fix_S3 = gmx_setzero_pr();
+ fiy_S0 = gmx_setzero_pr();
+ fiy_S1 = gmx_setzero_pr();
+ fiy_S2 = gmx_setzero_pr();
+ fiy_S3 = gmx_setzero_pr();
+ fiz_S0 = gmx_setzero_pr();
+ fiz_S1 = gmx_setzero_pr();
+ fiz_S2 = gmx_setzero_pr();
+ fiz_S3 = gmx_setzero_pr();
cjind = cjind0;
/* Add accumulated i-forces to the force array */
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
-#define gmx_load_ps4 _mm_load_ps
-#define gmx_store_ps4 _mm_store_ps
-#define gmx_add_ps4 _mm_add_ps
+#define gmx_load_pr4 _mm_load_ps
+#define gmx_store_pr4 _mm_store_ps
+#define gmx_add_pr4 _mm_add_ps
#else
-#define gmx_load_ps4 _mm256_load_pd
-#define gmx_store_ps4 _mm256_store_pd
-#define gmx_add_ps4 _mm256_add_pd
+#define gmx_load_pr4 _mm256_load_pd
+#define gmx_store_pr4 _mm256_store_pd
+#define gmx_add_pr4 _mm256_add_pd
#endif
- GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3, fix_SSE);
- gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
+ GMX_MM_TRANSPOSE_SUM4_PR(fix_S0, fix_S1, fix_S2, fix_S3, fix_S);
+ gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
- GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3, fiy_SSE);
- gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
+ GMX_MM_TRANSPOSE_SUM4_PR(fiy_S0, fiy_S1, fiy_S2, fiy_S3, fiy_S);
+ gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
- GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3, fiz_SSE);
- gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
+ GMX_MM_TRANSPOSE_SUM4_PR(fiz_S0, fiz_S1, fiz_S2, fiz_S3, fiz_S);
+ gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
#ifdef CALC_SHIFTFORCES
- gmx_store_ps4(shf, fix_SSE);
+ gmx_store_pr4(shf, fix_S);
fshift[ish3+0] += SUM_SIMD4(shf);
- gmx_store_ps4(shf, fiy_SSE);
+ gmx_store_pr4(shf, fiy_S);
fshift[ish3+1] += SUM_SIMD4(shf);
- gmx_store_ps4(shf, fiz_SSE);
+ gmx_store_pr4(shf, fiz_S);
fshift[ish3+2] += SUM_SIMD4(shf);
#endif
#else
- GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0, fix_SSE1, fix0_SSE);
- _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
- GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2, fix_SSE3, fix2_SSE);
- _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fix_S0, fix_S1, fix0_S);
+ _mm_store_pd(f+scix, _mm_add_pd(fix0_S, _mm_load_pd(f+scix)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fix_S2, fix_S3, fix2_S);
+ _mm_store_pd(f+scix+2, _mm_add_pd(fix2_S, _mm_load_pd(f+scix+2)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0, fiy_SSE1, fiy0_SSE);
- _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2, fiy_SSE3, fiy2_SSE);
- _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiy_S0, fiy_S1, fiy0_S);
+ _mm_store_pd(f+sciy, _mm_add_pd(fiy0_S, _mm_load_pd(f+sciy)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiy_S2, fiy_S3, fiy2_S);
+ _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_S, _mm_load_pd(f+sciy+2)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0, fiz_SSE1, fiz0_SSE);
- _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
- GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2, fiz_SSE3, fiz2_SSE);
- _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiz_S0, fiz_S1, fiz0_S);
+ _mm_store_pd(f+sciz, _mm_add_pd(fiz0_S, _mm_load_pd(f+sciz)));
+ GMX_MM_TRANSPOSE_SUM2_PD(fiz_S2, fiz_S3, fiz2_S);
+ _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_S, _mm_load_pd(f+sciz+2)));
#ifdef CALC_SHIFTFORCES
- _mm_store_pd(shf, _mm_add_pd(fix0_SSE, fix2_SSE));
+ _mm_store_pd(shf, _mm_add_pd(fix0_S, fix2_S));
fshift[ish3+0] += shf[0] + shf[1];
- _mm_store_pd(shf, _mm_add_pd(fiy0_SSE, fiy2_SSE));
+ _mm_store_pd(shf, _mm_add_pd(fiy0_S, fiy2_S));
fshift[ish3+1] += shf[0] + shf[1];
- _mm_store_pd(shf, _mm_add_pd(fiz0_SSE, fiz2_SSE));
+ _mm_store_pd(shf, _mm_add_pd(fiz0_S, fiz2_S));
fshift[ish3+2] += shf[0] + shf[1];
#endif
#endif
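
What the transpose-sum reduction above achieves, in scalar form: each of the four i-atom accumulators holds per-j-lane partial forces; GMX_MM_TRANSPOSE_SUM4_PR collapses them into one 4-wide vector whose lane k is the total force on i-atom k, ready for a single aligned read-modify-write of f[scix..scix+3]. A sketch:

    #include <stdio.h>

    /* Scalar sketch of GMX_MM_TRANSPOSE_SUM4_PR for 4-wide registers:
     * out[k] = sum over j-lanes of the partial forces on i-atom k. */
    static void transpose_sum4(const float fi[4][4], float out[4])
    {
        int k;

        for (k = 0; k < 4; k++)
        {
            out[k] = fi[k][0] + fi[k][1] + fi[k][2] + fi[k][3];
        }
    }

    int main(void)
    {
        float fix[4][4] = { {  1, 2,  3, 4 },
                            {  0, 0,  1, 0 },
                            { -1, 1, -1, 1 },
                            {  2, 2,  2, 2 } };
        float fix_sum[4];
        int   k;

        transpose_sum4(fix, fix_sum);
        for (k = 0; k < 4; k++)
        {
            printf("i-atom %d: fx = %g\n", k, (double)fix_sum[k]);
        }
        return 0;
    }
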
#ifdef CALC_ENERGIES
if (do_coul)
{
- gmx_store_pr(tmpsum, vctotSSE);
+ gmx_store_pr(tmpsum, vctot_S);
*Vc += SUM_SIMD(tmpsum);
}
- gmx_store_pr(tmpsum, VvdwtotSSE);
+ gmx_store_pr(tmpsum, Vvdwtot_S);
*Vvdw += SUM_SIMD(tmpsum);
#endif
#endif
}
-#undef gmx_load_ps4
-#undef gmx_store_ps4
-#undef gmx_store_ps4
+
+#undef gmx_load_pr4
+#undef gmx_store_pr4
+#undef gmx_add_pr4
#undef CALC_SHIFTFORCES
#undef STRIDE
#undef TAB_FDV0
#undef NBFP_STRIDE
+
+#undef GMX_USE_HALF_WIDTH_SIMD_HERE