fixed incorrect group energies with nbnxn CPU kernels
authorBerk Hess <hess@kth.se>
Thu, 25 Oct 2012 11:35:18 +0000 (13:35 +0200)
committerBerk Hess <hess@kth.se>
Fri, 26 Oct 2012 11:34:55 +0000 (13:34 +0200)
Fixed two energy group indexing errors in the nbnxn CPU kernels.
The code has now been generalized and we avoid code duplication.

Change-Id: I524d0d2c86b54a92ae124f2c067ccef3daac1515

src/mdlib/nbnxn_atomdata.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_common.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_common.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_utils.h

index bd83a96589c3315c37d7458bb4d8a5c4923ec3f2..0441011087ea345e686a047b20c9093fad2146ff 100644 (file)
@@ -617,9 +617,13 @@ void nbnxn_atomdata_init(FILE *fp,
     if (!simple)
     {
         /* Energy groups not supported yet for super-sub lists */
+        if (n_energygroups > 1 && fp != NULL)
+        {
+            fprintf(fp,"\nNOTE: With GPUs, reporting energy group contributions is not supported\n\n");
+        }
         nbat->nenergrp = 1;
     }
-    /* Temporary storage goes is #grp^3*8 real, so limit to 64 */
+    /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
     if (nbat->nenergrp > 64)
     {
         gmx_fatal(FARGS,"With NxN kernels not more than 64 energy groups are supported\n");
@@ -793,7 +797,7 @@ static void copy_egp_to_nbat_egps(const int *a,int na,int na_round,
     j = 0;
     for(i=0; i<na; i+=na_c)
     {
-        /* Store na_c energy groups number into one int */
+        /* Store na_c energy group numbers into one int */
         comb = 0;
         for(sa=0; sa<na_c; sa++)
         {
index 8bdcfb4aa6d04e9edd88fdadd25ce4c20b1ca625..442938e675f89af78c9766903e5c74a69f179846 100644 (file)
@@ -57,3 +57,33 @@ clear_fshift(real *fshift)
     }
 }
 
+void
+reduce_energies_over_lists(const nbnxn_atomdata_t     *nbat,
+                           int                        nlist,
+                           real                       *Vvdw,
+                           real                       *Vc)
+{
+    int nb;
+    int i,j,ind,indr;
+
+    for(nb=0; nb<nlist; nb++)
+    {
+        for(i=0; i<nbat->nenergrp; i++)
+        {
+            /* Reduce the diagonal terms */
+            ind = i*nbat->nenergrp + i;
+            Vvdw[ind] += nbat->out[nb].Vvdw[ind];
+            Vc[ind]   += nbat->out[nb].Vc[ind];
+
+            /* Reduce the off-diagonal terms */
+            for(j=i+1; j<nbat->nenergrp; j++)
+            {
+                /* The output should contain only one off-diagonal part */
+                ind  = i*nbat->nenergrp + j;
+                indr = j*nbat->nenergrp + i;
+                Vvdw[ind] += nbat->out[nb].Vvdw[ind] + nbat->out[nb].Vvdw[indr];
+                Vc[ind]   += nbat->out[nb].Vc[ind]   + nbat->out[nb].Vc[indr];
+            }
+        }
+    }
+}
index eacbaee44024d450ebd7671c1ac472aeaba4ddf9..716ed7f7187746c53bb96fcb7e0e65b8488ac250 100644 (file)
@@ -49,6 +49,12 @@ clear_f(const nbnxn_atomdata_t *nbat,real *f);
 void
 clear_fshift(real *fshift);
 
+void
+reduce_energies_over_lists(const nbnxn_atomdata_t     *nbat,
+                           int                        nlist,
+                           real                       *Vvdw,
+                           real                       *Vc);
+
 #if 0
 {
 #endif
index c1f9e2e40a8080b4e324188b939c47a60ae77a5d..b3b95c9c1c9af93b0a72b536427bd4b60162d4e5 100644 (file)
@@ -250,16 +250,6 @@ nbnxn_kernel_ref(const nbnxn_pairlist_set_t *nbl_list,
 
     if (force_flags & GMX_FORCE_ENERGY)
     {
-        /* Reduce the energies */
-        for(nb=0; nb<nnbl; nb++)
-        {
-            int i;
-
-            for(i=0; i<nbat->out[nb].nV; i++)
-            {
-                Vvdw[i] += nbat->out[nb].Vvdw[i];
-                Vc[i]   += nbat->out[nb].Vc[i];
-            }
-        }
+        reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc);
     }
 }
index 8d7497a5159cad615ca4e41199a1b6254cea56a5..a068914b8721fbd7181b97c5d31d14126f549392 100644 (file)
@@ -127,8 +127,6 @@ static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
     nbnxn_kernel_x86_simd128_tab_twin_comb_lb_noener,
     nbnxn_kernel_x86_simd128_tab_twin_comb_none_noener } };
 
-#endif /* SSE */
-
 
 static void reduce_group_energies(int ng,int ng_2log,
                                   const real *VSvdw,const real *VSc,
@@ -136,18 +134,13 @@ static void reduce_group_energies(int ng,int ng_2log,
 {
     int ng_p2,i,j,j0,j1,c,s;
 
-#ifndef GMX_DOUBLE
-#define SIMD_WIDTH   4
-#define SIMD_WIDTH_2 2
-#else
-#define SIMD_WIDTH   2
-#define SIMD_WIDTH_2 1
-#endif
+#define SIMD_WIDTH       (GMX_X86_SIMD_WIDTH_HERE)
+#define SIMD_WIDTH_HALF  (GMX_X86_SIMD_WIDTH_HERE/2)
 
     ng_p2 = (1<<ng_2log);
 
-    /* The size of the SSE energy group buffer array is:
-     * stride^3*SIMD_WIDTH_2*SIMD_WIDTH
+    /* The size of the x86 SIMD energy group buffer array is:
+     * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
      */
     for(i=0; i<ng; i++)
     {
@@ -161,8 +154,8 @@ static void reduce_group_energies(int ng,int ng_2log,
         {
             for(j0=0; j0<ng; j0++)
             {
-                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_2*SIMD_WIDTH;
-                for(s=0; s<SIMD_WIDTH_2; s++)
+                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
+                for(s=0; s<SIMD_WIDTH_HALF; s++)
                 {
                     Vvdw[i*ng+j0] += VSvdw[c+0];
                     Vvdw[i*ng+j1] += VSvdw[c+1];
@@ -175,6 +168,8 @@ static void reduce_group_energies(int ng,int ng_2log,
     }
 }
 
+#endif /* GMX_X86_SSE2 */
+
 void
 nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
                          const nbnxn_atomdata_t     *nbat,
@@ -296,17 +291,7 @@ nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
 
     if (force_flags & GMX_FORCE_ENERGY)
     {
-        /* Reduce the energies */
-        for(nb=0; nb<nnbl; nb++)
-        {
-            int i;
-
-            for(i=0; i<nbat->out[nb].nV; i++)
-            {
-                Vvdw[i] += nbat->out[nb].Vvdw[i];
-                Vc[i]   += nbat->out[nb].Vc[i];
-            }
-        }
+        reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc);
     }
 }
 #else
index 2396702b6e5550b92e75db355725d16edf87fc13..da236732daa2fbcc20d3fadb07d501aeae9b2bbe 100644 (file)
@@ -127,8 +127,6 @@ static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
     nbnxn_kernel_x86_simd256_tab_twin_comb_lb_noener,
     nbnxn_kernel_x86_simd256_tab_twin_comb_none_noener } };
 
-#endif /* SSE */
-
 
 static void reduce_group_energies(int ng,int ng_2log,
                                   const real *VSvdw,const real *VSc,
@@ -136,18 +134,13 @@ static void reduce_group_energies(int ng,int ng_2log,
 {
     int ng_p2,i,j,j0,j1,c,s;
 
-#ifndef GMX_DOUBLE
-#define SIMD_WIDTH   4
-#define SIMD_WIDTH_2 2
-#else
-#define SIMD_WIDTH   2
-#define SIMD_WIDTH_2 1
-#endif
+#define SIMD_WIDTH       (GMX_X86_SIMD_WIDTH_HERE)
+#define SIMD_WIDTH_HALF  (GMX_X86_SIMD_WIDTH_HERE/2)
 
     ng_p2 = (1<<ng_2log);
 
-    /* The size of the SSE energy group buffer array is:
-     * stride^3*SIMD_WIDTH_2*SIMD_WIDTH
+    /* The size of the x86 SIMD energy group buffer array is:
+     * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
      */
     for(i=0; i<ng; i++)
     {
@@ -161,8 +154,8 @@ static void reduce_group_energies(int ng,int ng_2log,
         {
             for(j0=0; j0<ng; j0++)
             {
-                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_2*SIMD_WIDTH;
-                for(s=0; s<SIMD_WIDTH_2; s++)
+                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
+                for(s=0; s<SIMD_WIDTH_HALF; s++)
                 {
                     Vvdw[i*ng+j0] += VSvdw[c+0];
                     Vvdw[i*ng+j1] += VSvdw[c+1];
@@ -175,6 +168,8 @@ static void reduce_group_energies(int ng,int ng_2log,
     }
 }
 
+#endif /* GMX_X86_AVX_256 */
+
 void
 nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
                          const nbnxn_atomdata_t     *nbat,
@@ -296,17 +291,7 @@ nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
 
     if (force_flags & GMX_FORCE_ENERGY)
     {
-        /* Reduce the energies */
-        for(nb=0; nb<nnbl; nb++)
-        {
-            int i;
-
-            for(i=0; i<nbat->out[nb].nV; i++)
-            {
-                Vvdw[i] += nbat->out[nb].Vvdw[i];
-                Vc[i]   += nbat->out[nb].Vc[i];
-            }
-        }
+        reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc);
     }
 }
 #else
index b9fdd34efc6a4e3989c555494fe26a73293cb658..56d04acee5c6b6c62d79be87b9e609375f3deac8 100644 (file)
@@ -58,9 +58,8 @@
             int        cj,aj,ajx,ajy,ajz;
 
 #ifdef ENERGY_GROUPS
-            int        egps_j;
-            int        egp_jj[UNROLLJ>>1];
-            int        jj;
+            /* Energy group indices for two atoms packed into one int */
+            int        egp_jj[UNROLLJ/2];
 #endif
 
 #ifdef CHECK_EXCLS
             
 #ifdef CALC_ENERGIES
 #ifdef ENERGY_GROUPS
-            /* Extract the group pair index per j pair */
+            /* Extract the group pair index per j pair.
+             * Energy groups are stored per i-cluster, so things get
+             * complicated when the i- and j-cluster size don't match.
+             */
+            {
+                int egps_j;
 #if UNROLLJ == 2
-            egps_j        = nbat->energrp[cj>>1];
-            egp_jj[0]     = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
+                egps_j    = nbat->energrp[cj>>1];
+                egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
 #else
-            egps_j        = nbat->energrp[cj];
-            for(jj=0; jj<(UNROLLJ>>1); jj++)
-            {
-                egp_jj[jj]  = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
-            }
+                /* We assume UNROLLI <= UNROLLJ */
+                int jdi;
+                for(jdi=0; jdi<UNROLLJ/UNROLLI; jdi++)
+                {
+                    int jj;
+                    egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
+                    for(jj=0; jj<(UNROLLI/2); jj++)
+                    {
+                        egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+                    }
+                }
 #endif
+            }
 #endif
 
 #ifdef CALC_COULOMB
index 960d783e5ec4454c710faa408281b899be168104..02b6e21fbcd5aefd36af2e1849d98ed8485c12a5 100644 (file)
@@ -492,7 +492,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
         {
             int ia,egp_ia;
 
-            for(ia=0; ia<4; ia++)
+            for(ia=0; ia<UNROLLI; ia++)
             {
                 egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
                 vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
index 4ef461092254bdb46abbae7229bb75f4f11f7ba5..2624c30d3e4275393cf5936d2eee4033f42da526 100644 (file)
@@ -477,7 +477,7 @@ static inline void add_ener_grp(gmx_mm_pr e_SSE,real *v,int *offset_jj)
      * the rapidly increases number of combinations of energy groups.
      * We add to a temporary buffer for 1 i-group vs 2 j-groups.
      */
-    for(jj=0; jj<(UNROLLJ>>1); jj++)
+    for(jj=0; jj<(UNROLLJ/2); jj++)
     {
         gmx_mm_pr v_SSE;