Fixed two energy group indexing errors in the nbnxn CPU kernels.
The code has now been generalized and we avoid code duplication.
Change-Id: I524d0d2c86b54a92ae124f2c067ccef3daac1515
if (!simple)
{
/* Energy groups not supported yet for super-sub lists */
+ if (n_energygroups > 1 && fp != NULL)
+ {
+ fprintf(fp,"\nNOTE: With GPUs, reporting energy group contributions is not supported\n\n");
+ }
nbat->nenergrp = 1;
}
- /* Temporary storage goes is #grp^3*8 real, so limit to 64 */
+ /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
if (nbat->nenergrp > 64)
{
gmx_fatal(FARGS,"With NxN kernels not more than 64 energy groups are supported\n");
j = 0;
for(i=0; i<na; i+=na_c)
{
- /* Store na_c energy groups number into one int */
+ /* Store na_c energy group numbers into one int */
comb = 0;
for(sa=0; sa<na_c; sa++)
{
}
}
+/* Sum the per-list energy-group pair energies of the nlist output
+ * buffers in nbat into the nenergrp x nenergrp matrices Vvdw and Vc.
+ * Diagonal entries (i,i) are accumulated directly. For off-diagonal
+ * pairs, both the (i,j) and (j,i) contributions from the kernel
+ * output are folded into the single entry (i,j) with i < j, so the
+ * reduced result carries each off-diagonal energy only once, in the
+ * upper triangle (see the comment in the inner loop below).
+ */
+void
+reduce_energies_over_lists(const nbnxn_atomdata_t *nbat,
+ int nlist,
+ real *Vvdw,
+ real *Vc)
+{
+ int nb;
+ int i,j,ind,indr;
+
+ for(nb=0; nb<nlist; nb++)
+ {
+ for(i=0; i<nbat->nenergrp; i++)
+ {
+ /* Reduce the diagonal terms */
+ ind = i*nbat->nenergrp + i;
+ Vvdw[ind] += nbat->out[nb].Vvdw[ind];
+ Vc[ind] += nbat->out[nb].Vc[ind];
+
+ /* Reduce the off-diagonal terms */
+ for(j=i+1; j<nbat->nenergrp; j++)
+ {
+ /* The output should contain only one off-diagonal part */
+ ind = i*nbat->nenergrp + j;
+ indr = j*nbat->nenergrp + i;
+ Vvdw[ind] += nbat->out[nb].Vvdw[ind] + nbat->out[nb].Vvdw[indr];
+ Vc[ind] += nbat->out[nb].Vc[ind] + nbat->out[nb].Vc[indr];
+ }
+ }
+ }
+}
void
clear_fshift(real *fshift);
+/* Sums the energy-group pair energies of all nlist output buffers in
+ * nbat into Vvdw and Vc; off-diagonal group-pair contributions end up
+ * in the upper triangle only (entries (i,j) with i < j).
+ */
+void
+reduce_energies_over_lists(const nbnxn_atomdata_t *nbat,
+ int nlist,
+ real *Vvdw,
+ real *Vc);
+
#if 0
{
#endif
if (force_flags & GMX_FORCE_ENERGY)
{
- /* Reduce the energies */
- for(nb=0; nb<nnbl; nb++)
- {
- int i;
-
- for(i=0; i<nbat->out[nb].nV; i++)
- {
- Vvdw[i] += nbat->out[nb].Vvdw[i];
- Vc[i] += nbat->out[nb].Vc[i];
- }
- }
+ reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc);
}
}
nbnxn_kernel_x86_simd128_tab_twin_comb_lb_noener,
nbnxn_kernel_x86_simd128_tab_twin_comb_none_noener } };
-#endif /* SSE */
-
static void reduce_group_energies(int ng,int ng_2log,
const real *VSvdw,const real *VSc,
{
int ng_p2,i,j,j0,j1,c,s;
-#ifndef GMX_DOUBLE
-#define SIMD_WIDTH 4
-#define SIMD_WIDTH_2 2
-#else
-#define SIMD_WIDTH 2
-#define SIMD_WIDTH_2 1
-#endif
+#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE)
+#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2)
ng_p2 = (1<<ng_2log);
- /* The size of the SSE energy group buffer array is:
- * stride^3*SIMD_WIDTH_2*SIMD_WIDTH
+ /* The size of the x86 SIMD energy group buffer array is:
+ * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
*/
for(i=0; i<ng; i++)
{
{
for(j0=0; j0<ng; j0++)
{
- c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_2*SIMD_WIDTH;
- for(s=0; s<SIMD_WIDTH_2; s++)
+ c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
+ for(s=0; s<SIMD_WIDTH_HALF; s++)
{
Vvdw[i*ng+j0] += VSvdw[c+0];
Vvdw[i*ng+j1] += VSvdw[c+1];
}
}
+#endif /* GMX_X86_SSE2 */
+
void
nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t *nbl_list,
const nbnxn_atomdata_t *nbat,
if (force_flags & GMX_FORCE_ENERGY)
{
- /* Reduce the energies */
- for(nb=0; nb<nnbl; nb++)
- {
- int i;
-
- for(i=0; i<nbat->out[nb].nV; i++)
- {
- Vvdw[i] += nbat->out[nb].Vvdw[i];
- Vc[i] += nbat->out[nb].Vc[i];
- }
- }
+ reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc);
}
}
#else
nbnxn_kernel_x86_simd256_tab_twin_comb_lb_noener,
nbnxn_kernel_x86_simd256_tab_twin_comb_none_noener } };
-#endif /* SSE */
-
static void reduce_group_energies(int ng,int ng_2log,
const real *VSvdw,const real *VSc,
{
int ng_p2,i,j,j0,j1,c,s;
-#ifndef GMX_DOUBLE
-#define SIMD_WIDTH 4
-#define SIMD_WIDTH_2 2
-#else
-#define SIMD_WIDTH 2
-#define SIMD_WIDTH_2 1
-#endif
+#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE)
+#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2)
ng_p2 = (1<<ng_2log);
- /* The size of the SSE energy group buffer array is:
- * stride^3*SIMD_WIDTH_2*SIMD_WIDTH
+ /* The size of the x86 SIMD energy group buffer array is:
+ * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
*/
for(i=0; i<ng; i++)
{
{
for(j0=0; j0<ng; j0++)
{
- c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_2*SIMD_WIDTH;
- for(s=0; s<SIMD_WIDTH_2; s++)
+ c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
+ for(s=0; s<SIMD_WIDTH_HALF; s++)
{
Vvdw[i*ng+j0] += VSvdw[c+0];
Vvdw[i*ng+j1] += VSvdw[c+1];
}
}
+#endif /* GMX_X86_AVX_256 */
+
void
nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t *nbl_list,
const nbnxn_atomdata_t *nbat,
if (force_flags & GMX_FORCE_ENERGY)
{
- /* Reduce the energies */
- for(nb=0; nb<nnbl; nb++)
- {
- int i;
-
- for(i=0; i<nbat->out[nb].nV; i++)
- {
- Vvdw[i] += nbat->out[nb].Vvdw[i];
- Vc[i] += nbat->out[nb].Vc[i];
- }
- }
+ reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc);
}
}
#else
int cj,aj,ajx,ajy,ajz;
#ifdef ENERGY_GROUPS
- int egps_j;
- int egp_jj[UNROLLJ>>1];
- int jj;
+ /* Energy group indices for two atoms packed into one int */
+ int egp_jj[UNROLLJ/2];
#endif
#ifdef CHECK_EXCLS
#ifdef CALC_ENERGIES
#ifdef ENERGY_GROUPS
- /* Extract the group pair index per j pair */
+ /* Extract the group pair index per j pair.
+ * Energy groups are stored per i-cluster, so things get
+ * complicated when the i- and j-cluster size don't match.
+ */
+ {
+ int egps_j;
#if UNROLLJ == 2
- egps_j = nbat->energrp[cj>>1];
- egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
+ egps_j = nbat->energrp[cj>>1];
+ egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
#else
- egps_j = nbat->energrp[cj];
- for(jj=0; jj<(UNROLLJ>>1); jj++)
- {
- egp_jj[jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
- }
+ /* We assume UNROLLI <= UNROLLJ */
+ int jdi;
+ for(jdi=0; jdi<UNROLLJ/UNROLLI; jdi++)
+ {
+ int jj;
+ egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
+ for(jj=0; jj<(UNROLLI/2); jj++)
+ {
+ egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+ }
+ }
#endif
+ }
#endif
#ifdef CALC_COULOMB
{
int ia,egp_ia;
- for(ia=0; ia<4; ia++)
+ for(ia=0; ia<UNROLLI; ia++)
{
egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
* the rapidly increases number of combinations of energy groups.
* We add to a temporary buffer for 1 i-group vs 2 j-groups.
*/
- for(jj=0; jj<(UNROLLJ>>1); jj++)
+ for(jj=0; jj<(UNROLLJ/2); jj++)
{
gmx_mm_pr v_SSE;