#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
+/* single precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#define TAB_FDV0
#else
-/* SSE double precision 4x2 kernel */
+/* double precision 4x2 kernel */
#define SUM_SIMD(x) (x[0]+x[1])
#endif
#endif
#ifdef GMX_MM256_HERE
#ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 4x8 kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
#define TAB_FDV0
#else
-/* AVX double precision 4x4 kernel */
+/* double precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#endif
#endif
int nbfp_stride;
int n,ci,ci_sh;
int ish,ish3;
- gmx_bool half_LJ,do_coul;
+ gmx_bool do_LJ,half_LJ,do_coul;
int sci,scix,sciy,sciz,sci2;
int cjind0,cjind1,cjind;
int ip,jp;
__m128d fix2_SSE,fiy2_SSE,fiz2_SSE;
#endif
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
__m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
__m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
__m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
__m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
#endif
-#else
+#endif
+#ifdef GMX_MM256_HERE
/* AVX: use floating point masks, as there are no integer instructions */
#ifndef GMX_DOUBLE
gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
#endif
#endif
-#ifndef GMX_MM256_HERE
-#ifndef GMX_DOUBLE
- __m128 diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
- __m128 diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- __m128 diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
- __m128 diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
+ gmx_mm_pr diag_jmi_SSE;
+#if UNROLLI == UNROLLJ
+ gmx_mm_pr diag_SSE0,diag_SSE1,diag_SSE2,diag_SSE3;
#else
- __m128d diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- __m128d diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- __m128d diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- __m128d diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- __m128d diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
- __m128d diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
- __m128d diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- __m128d diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#endif
-#else /* GMX_MM256_HERE */
-#ifndef GMX_DOUBLE
- gmx_mm_pr diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
- gmx_mm_pr diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#else
- gmx_mm_pr diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#endif
+ gmx_mm_pr diag0_SSE0,diag0_SSE1,diag0_SSE2,diag0_SSE3;
+ gmx_mm_pr diag1_SSE0,diag1_SSE1,diag1_SSE2,diag1_SSE3;
#endif
-#ifndef GMX_MM256_HERE
+#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
__m128i zeroi_SSE = _mm_setzero_si128();
#endif
-#ifdef GMX_X86_SSE4_1
gmx_mm_pr zero_SSE = gmx_set1_pr(0);
-#endif
gmx_mm_pr one_SSE=gmx_set1_pr(1.0);
gmx_mm_pr iq_SSE0=gmx_setzero_pr();
const real *tab_coul_V;
#endif
#ifdef GMX_MM256_HERE
- int ti0_array[2*UNROLLJ-1],*ti0;
- int ti1_array[2*UNROLLJ-1],*ti1;
- int ti2_array[2*UNROLLJ-1],*ti2;
- int ti3_array[2*UNROLLJ-1],*ti3;
+ int ti0_array[2*GMX_SIMD_WIDTH_HERE-1],*ti0;
+ int ti1_array[2*GMX_SIMD_WIDTH_HERE-1],*ti1;
+ int ti2_array[2*GMX_SIMD_WIDTH_HERE-1],*ti2;
+ int ti3_array[2*GMX_SIMD_WIDTH_HERE-1],*ti3;
#endif
#ifdef CALC_ENERGIES
gmx_mm_pr mhalfsp_SSE;
nbfp_stride = NBFP_STRIDE;
#endif
+ /* Load j-i for the first i */
+ diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag);
+ /* Generate all the diagonal masks as comparison results */
+#if UNROLLI == UNROLLJ
+ diag_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_SSE1 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_SSE3 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#else
+#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
+ diag0_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag0_SSE1 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag0_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag0_SSE3 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+
+#if UNROLLI == 2*UNROLLJ
+ /* Load j-i for the second half of the j-cluster */
+ diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
+#endif
+
+ diag1_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag1_SSE1 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag1_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag1_SSE3 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#endif
+#endif
+
#ifdef CALC_COUL_TAB
#ifdef GMX_MM256_HERE
- /* Generate aligned table pointers */
- ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
- ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
- ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
- ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
+ /* Generate aligned table index pointers */
+ ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti1 = (int *)(((size_t)(ti1_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
+ ti3 = (int *)(((size_t)(ti3_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
#endif
invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
egps_jshift = 2*nbat->neg_2log;
egps_jmask = (1<<egps_jshift) - 1;
egps_jstride = (UNROLLJ>>1)*UNROLLJ;
- /* Major division is over i-particles: divide nVS by 4 for i-stride */
+ /* Major division is over i-particle energy groups, determine the stride */
Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif
ish = (nbln->shift & NBNXN_CI_SHIFT);
ish3 = ish*3;
- cjind0 = nbln->cj_ind_start;
- cjind1 = nbln->cj_ind_end;
- /* Currently only works super-cells equal to sub-cells */
+ cjind0 = nbln->cj_ind_start;
+ cjind1 = nbln->cj_ind_end;
ci = nbln->ci;
ci_sh = (ish == CENTRAL ? ci : -1);
sci += (ci & 1)*(STRIDE>>1);
#endif
- half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+ /* We have 5 LJ/C combinations, but use only three inner loops,
+ * as the other combinations are unlikely and/or not much faster:
+ * inner half-LJ + C for half-LJ + C / no-LJ + C
+ * inner LJ + C for full-LJ + C
+ * inner LJ for full-LJ + no-C / half-LJ + no-C
+ */
+ do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+ half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef ENERGY_GROUPS
egps_i = nbat->energrp[ci];
iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);
- /* With half_LJ we currently always calculate Coulomb interactions */
- if (do_coul || half_LJ)
+ if (do_coul)
{
iq_SSE0 = gmx_set1_pr(facel*q[sci]);
iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);