minor speed-up and code clean-up in nbnxn kernels

author Berk Hess <hess@kth.se>

Wed, 2 Jan 2013 10:42:50 +0000 (11:42 +0100)

committer Berk Hess <hess@kth.se>

Wed, 2 Jan 2013 21:48:12 +0000 (22:48 +0100)
author Berk Hess <hess@kth.se>
Wed, 2 Jan 2013 10:42:50 +0000 (11:42 +0100)
committer Berk Hess <hess@kth.se>
Wed, 2 Jan 2013 21:48:12 +0000 (22:48 +0100)
diff --git a/include/types/nbnxn_pairlist.h b/include/types/nbnxn_pairlist.h

index b6bc9650c6d1d367ee4fe5e426958a1d74bd4924..4d337cf1a3f4a49260a2b6becafa5b6724b62df5 100644 (file)
--- a/include/types/nbnxn_pairlist.h
+++ b/include/types/nbnxn_pairlist.h
@@ -71,6 +71,14 @@ typedef struct {
      unsigned excl;  /* The exclusion (interaction) bits */
  } nbnxn_cj_t;
  
+/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
+ * The upper bits contain information for non-bonded kernel optimization.
+ * Simply calculating LJ and Coulomb for all pairs in a cluster pair is fine.
+ * But three flags can be used to skip interactions, currently only for subc=0:
+ * !(shift & NBNXN_CI_DO_LJ(subc))   => we can skip LJ for all pairs
+ * shift & NBNXN_CI_HALF_LJ(subc)    => we can skip LJ for the second half of i
+ * !(shift & NBNXN_CI_DO_COUL(subc)) => we can skip Coulomb for all pairs
+ */
  #define NBNXN_CI_SHIFT          127
  #define NBNXN_CI_DO_LJ(subc)    (1<<(7+3*(subc)))
  #define NBNXN_CI_HALF_LJ(subc)  (1<<(8+3*(subc)))
@@ -79,7 +87,7 @@ typedef struct {
  /* Simple pair-list i-unit */
  typedef struct {
      int ci;             /* i-cluster             */
-    int shift;          /* Shift vector index plus possible flags */
+    int shift;          /* Shift vector index plus possible flags, see above */
      int cj_ind_start;   /* Start index into cj   */
      int cj_ind_end;     /* End index into cj     */
  } nbnxn_ci_t;
diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c

index dc0758a3fccc3ecd0592fee4a932c8e4af81a476..c1955a99548da156ae3e27ad8b764c97997509dd 100644 (file)
--- a/src/mdlib/forcerec.c
+++ b/src/mdlib/forcerec.c
@@ -1433,7 +1433,7 @@ static void pick_nbnxn_kernel_cpu(FILE *fp,
  #endif
          if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
          {
-#ifdef GMX_NBNXN_SIMD_2XNN
+#ifdef GMX_NBNXN_SIMD_4XN
              *kernel_type = nbnxnk4xN_SIMD_4xN;
  #else
              gmx_fatal(FARGS,"SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h

index 97a0ef84b9dc535806fa6e60a7935f913619edb4..ce5a6734c8953ba4ab7bb48f1890e226871e84ed 100644 (file)
--- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h
@@ -108,7 +108,7 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
      real       *nbfp_i;
      int        n,ci,ci_sh;
      int        ish,ishf;
-    gmx_bool   half_LJ,do_coul;
+    gmx_bool   do_LJ,half_LJ,do_coul;
      int        cjind0,cjind1,cjind;
      int        ip,jp;
  
@@ -213,8 +213,15 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
          ci               = nbln->ci;
          ci_sh            = (ish == CENTRAL ? ci : -1);
  
-        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
          do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
  
  #ifdef CALC_ENERGIES
  #ifndef ENERGY_GROUPS
@@ -237,8 +244,7 @@ NBK_FUNC_NAME(nbnxn_kernel_ref,energrp)
              }
          }
  
-        /* With half_LJ we currently always calculate Coulomb interactions */
-        if (do_coul || half_LJ)
+        if (do_coul)
          {
  #ifdef CALC_ENERGIES
              real Vc_sub_self;
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h

index faa445efbfb1fb2ef9d67becdbc88ed1a9b46de2..78242d73cec848e792ceb3b70ec804a98caa7351 100644 (file)
--- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
@@ -35,7 +35,7 @@
   * the research papers on the package. Check out http://www.gromacs.org.
   */
  
-/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
+/* GMX_MM256_HERE should be set before including this file */
  #include "gmx_simd_macros.h"
  
  #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
@@ -43,32 +43,17 @@
  #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
  #define UNROLLJ    (GMX_SIMD_WIDTH_HERE/2)
  
-#if defined GMX_MM128_HERE || defined GMX_DOUBLE
-#define STRIDE     4
-#endif
-#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
+#if defined GMX_MM256_HERE
  #define STRIDE     4
  #endif 
  
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
-#define TAB_FDV0
-#else
-/* SSE double precision 4x2 kernel */
-#define SUM_SIMD(x) (x[0]+x[1])
-#endif
-#endif
-
  #ifdef GMX_MM256_HERE
  #ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 2x(4+4) kernel */
  #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
  #define TAB_FDV0
  #else
-/* AVX double precision 4x4 kernel */
-#define SUM_SIMD(x) SUM_SIMD4(x)
+#error "unsupported kernel configuration"
  #endif
  #endif
  
@@ -167,7 +152,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
      int        nbfp_stride;
      int        n,ci,ci_sh;
      int        ish,ish3;
-    gmx_bool   half_LJ,do_coul;
+    gmx_bool   do_LJ,half_LJ,do_coul;
      int        sci,scix,sciy,sciz,sci2;
      int        cjind0,cjind1,cjind;
      int        ip,jp;
@@ -206,9 +191,6 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
      gmx_mm_pr  diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
      gmx_mm_pr  diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
  
-#ifndef GMX_MM256_HERE
-    __m128i    zeroi_SSE = _mm_setzero_si128();
-#endif
  #ifdef GMX_X86_SSE4_1
      gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
  #endif
@@ -407,7 +389,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
      egps_jshift  = 2*nbat->neg_2log;
      egps_jmask   = (1<<egps_jshift) - 1;
      egps_jstride = (UNROLLJ>>1)*UNROLLJ;
-    /* Major division is over i-particles: divide nVS by 4 for i-stride */
+    /* Major division is over i-particle energy groups, determine the stride */
      Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
  #endif
  
@@ -420,9 +402,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
  
          ish              = (nbln->shift & NBNXN_CI_SHIFT);
          ish3             = ish*3;
-        cjind0           = nbln->cj_ind_start;      
-        cjind1           = nbln->cj_ind_end;    
-        /* Currently only works super-cells equal to sub-cells */
+        cjind0           = nbln->cj_ind_start;
+        cjind1           = nbln->cj_ind_end;
          ci               = nbln->ci;
          ci_sh            = (ish == CENTRAL ? ci : -1);
  
@@ -441,8 +422,15 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
          sci             += (ci & 1)*(STRIDE>>1);
  #endif
  
-        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
          do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
  
  #ifdef ENERGY_GROUPS
          egps_i = nbat->energrp[ci];
@@ -513,8 +501,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
          iz_SSE0          = gmx_add_pr(gmx_load2_hpr(x+sciz)  ,shZ_SSE);
          iz_SSE2          = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE);
  
-        /* With half_LJ we currently always calculate Coulomb interactions */
-        if (do_coul || half_LJ)
+        if (do_coul)
          {
              gmx_mm_pr facel_SSE;
  
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h

index 1ab915deaecc3e5670ac0de2fffbd6b7da69bf78..1545d40380c8d48fcc21b16b96477343ef881dfe 100644 (file)
--- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
+++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
@@ -52,22 +52,22 @@
  
  #ifdef GMX_MM128_HERE
  #ifndef GMX_DOUBLE
-/* SSE single precision 4x4 kernel */
+/* single precision 4x4 kernel */
  #define SUM_SIMD(x) SUM_SIMD4(x)
  #define TAB_FDV0
  #else
-/* SSE double precision 4x2 kernel */
+/* double precision 4x2 kernel */
  #define SUM_SIMD(x) (x[0]+x[1])
  #endif
  #endif
  
  #ifdef GMX_MM256_HERE
  #ifndef GMX_DOUBLE
-/* AVX single precision 4x8 kernel */
+/* single precision 4x8 kernel */
  #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
  #define TAB_FDV0
  #else
-/* AVX double precision 4x4 kernel */
+/* double precision 4x4 kernel */
  #define SUM_SIMD(x) SUM_SIMD4(x)
  #endif
  #endif
@@ -167,7 +167,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
      int        nbfp_stride;
      int        n,ci,ci_sh;
      int        ish,ish3;
-    gmx_bool   half_LJ,do_coul;
+    gmx_bool   do_LJ,half_LJ,do_coul;
      int        sci,scix,sciy,sciz,sci2;
      int        cjind0,cjind1,cjind;
      int        ip,jp;
@@ -203,7 +203,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
      __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;
  #endif
  
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
  #ifndef GMX_DOUBLE
      __m128i    mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
      __m128i    mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
@@ -216,7 +216,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
      __m128i    mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
      __m128i    mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
  #endif
-#else
+#endif
+#ifdef GMX_MM256_HERE
      /* AVX: use floating point masks, as there are no integer instructions */
  #ifndef GMX_DOUBLE
      gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
@@ -230,7 +231,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
  #endif
  #endif
  
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
  #ifndef GMX_DOUBLE
      __m128     diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
      __m128     diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
@@ -246,7 +247,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
      __m128d    diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
      __m128d    diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
  #endif
-#else /* GMX_MM256_HERE */
+#endif
+#ifdef GMX_MM256_HERE
  #ifndef GMX_DOUBLE
      gmx_mm_pr  diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
      gmx_mm_pr  diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
@@ -264,7 +266,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
  #endif
  #endif
  
-#ifndef GMX_MM256_HERE
+#ifdef GMX_MM128_HERE
      __m128i    zeroi_SSE = _mm_setzero_si128();
  #endif
  #ifdef GMX_X86_SSE4_1
@@ -475,7 +477,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
      egps_jshift  = 2*nbat->neg_2log;
      egps_jmask   = (1<<egps_jshift) - 1;
      egps_jstride = (UNROLLJ>>1)*UNROLLJ;
-    /* Major division is over i-particles: divide nVS by 4 for i-stride */
+    /* Major division is over i-particle energy groups, determine the stride */
      Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
  #endif
  
@@ -488,9 +490,8 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
  
          ish              = (nbln->shift & NBNXN_CI_SHIFT);
          ish3             = ish*3;
-        cjind0           = nbln->cj_ind_start;      
-        cjind1           = nbln->cj_ind_end;    
-        /* Currently only works super-cells equal to sub-cells */
+        cjind0           = nbln->cj_ind_start;
+        cjind1           = nbln->cj_ind_end;
          ci               = nbln->ci;
          ci_sh            = (ish == CENTRAL ? ci : -1);
  
@@ -509,8 +510,15 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
          sci             += (ci & 1)*(STRIDE>>1);
  #endif
  
-        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
+        /* We have 5 LJ/C combinations, but use only three inner loops,
+         * as the other combinations are unlikely and/or not much faster:
+         * inner half-LJ + C for half-LJ + C / no-LJ + C
+         * inner LJ + C      for full-LJ + C
+         * inner LJ          for full-LJ + no-C / half-LJ + no-C
+         */
+        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
          do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
+        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
  
  #ifdef ENERGY_GROUPS
          egps_i = nbat->energrp[ci];
@@ -585,8 +593,7 @@ NBK_FUNC_NAME(nbnxn_kernel_simd_4xn,energrp)
          iz_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
          iz_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);
  
-        /* With half_LJ we currently always calculate Coulomb interactions */
-        if (do_coul || half_LJ)
+        if (do_coul)
          {
              iq_SSE0      = gmx_set1_pr(facel*q[sci]);
              iq_SSE1      = gmx_set1_pr(facel*q[sci+1]);
diff --git a/src/mdlib/nbnxn_search.c b/src/mdlib/nbnxn_search.c

index 0358e3523d40217794eafd3a8067056093e33f36..dd6b2c31df62d58d00fa3327c7dd63a9a12d71fa 100644 (file)
--- a/src/mdlib/nbnxn_search.c
+++ b/src/mdlib/nbnxn_search.c
@@ -3362,13 +3362,17 @@ static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
      {
          sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
  
-        if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
+        /* The counts below are used for non-bonded pair/flop counts
+         * and should therefore match the available kernel setups.
+         */
+        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
          {
-            nbl->work->ncj_hlj += jlen;
+            nbl->work->ncj_noq += jlen;
          }
-        else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
+        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
+                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
          {
-            nbl->work->ncj_noq += jlen;
+            nbl->work->ncj_hlj += jlen;
          }
  
          nbl->nci++;
author	Berk Hess <hess@kth.se>
	Wed, 2 Jan 2013 10:42:50 +0000 (11:42 +0100)
committer	Berk Hess <hess@kth.se>
	Wed, 2 Jan 2013 21:48:12 +0000 (22:48 +0100)
include/types/nbnxn_pairlist.h		patch | blob | history
src/mdlib/forcerec.c		patch | blob | history
src/mdlib/nbnxn_kernels/nbnxn_kernel_ref_outer.h		patch | blob | history
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h		patch | blob | history
src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h		patch | blob | history
src/mdlib/nbnxn_search.c		patch | blob | history