Merge "Extended build information output and reference build type" into release-4-6
author    Erik Lindahl <erik@kth.se>
          Mon, 26 Nov 2012 13:30:00 +0000 (14:30 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
          Mon, 26 Nov 2012 13:30:00 +0000 (14:30 +0100)
12 files changed:
include/gmx_x86_simd_macros.h
include/types/nb_verlet.h
include/types/nrnb.h
src/gmxlib/nrnb.c
src/mdlib/forcerec.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h
src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h
src/mdlib/sim_util.c

diff --git a/include/gmx_x86_simd_macros.h b/include/gmx_x86_simd_macros.h
index b896d396baadfa88505d90541c207911e91ccac9..e08cd224ef427cf0946e57cf5d13ef7fa6c3ca71 100644
@@ -70,6 +70,9 @@
 #undef gmx_calc_rsq_pr
 #undef gmx_sum4_pr
 
+#undef gmx_pmecorrF_pr
+#undef gmx_pmecorrV_pr
+
 
 /* By defining GMX_MM128_HERE or GMX_MM256_HERE before including this file
  * the same intrinsics, with defines, can be compiled for either 128 or 256
 #define gmx_calc_rsq_pr   gmx_mm_calc_rsq_ps
 #define gmx_sum4_pr       gmx_mm_sum4_ps
 
+#define gmx_pmecorrF_pr   gmx_mm_pmecorrF_ps
+#define gmx_pmecorrV_pr   gmx_mm_pmecorrV_ps
+
 #else /* ifndef GMX_DOUBLE */
 
 #include "gmx_x86_simd_double.h"
 #define gmx_calc_rsq_pr   gmx_mm_calc_rsq_pd
 #define gmx_sum4_pr       gmx_mm_sum4_pd
 
+#define gmx_pmecorrF_pr   gmx_mm_pmecorrF_pd
+#define gmx_pmecorrV_pr   gmx_mm_pmecorrV_pd
+
 #endif /* ifndef GMX_DOUBLE */
 
 #endif /* GMX_MM128_HERE */
 #define gmx_calc_rsq_pr   gmx_mm256_calc_rsq_ps
 #define gmx_sum4_pr       gmx_mm256_sum4_ps
 
+#define gmx_pmecorrF_pr   gmx_mm256_pmecorrF_ps
+#define gmx_pmecorrV_pr   gmx_mm256_pmecorrV_ps
+
 #else
 
 #include "gmx_x86_simd_double.h"
 #define gmx_calc_rsq_pr   gmx_mm256_calc_rsq_pd
 #define gmx_sum4_pr       gmx_mm256_sum4_pd
 
+#define gmx_pmecorrF_pr   gmx_mm256_pmecorrF_pd
+#define gmx_pmecorrV_pr   gmx_mm256_pmecorrV_pd
+
 #endif
 
 #endif /* GMX_MM256_HERE */
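
The four new gmx_pmecorrF_pr / gmx_pmecorrV_pr aliases follow the existing pattern of this header: one width- and precision-agnostic name is mapped onto the matching _ps or _pd intrinsic for 128-bit or 256-bit SIMD. Their use in nbnxn_kernel_x86_simd_inner.h further down indicates that they supply the analytical Ewald (PME) exclusion correction, i.e. the erf part of the standard real-space Ewald pair interaction (Coulomb prefactor folded into the charges):

    V(r) = q_i q_j\,\frac{\mathrm{erfc}(\beta r)}{r}
         = q_i q_j\left(\frac{1}{r}-\frac{\mathrm{erf}(\beta r)}{r}\right),
    \qquad
    F(r) = q_i q_j\left(\frac{\mathrm{erfc}(\beta r)}{r^{2}}
         + \frac{2\beta}{\sqrt{\pi}}\,\frac{e^{-\beta^{2}r^{2}}}{r}\right),

with beta the Ewald splitting parameter (ic->ewaldcoeff). Evaluating the erf correction analytically is what lets the new kernels avoid the table lookups used by the CALC_COUL_TAB variants.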
diff --git a/include/types/nb_verlet.h b/include/types/nb_verlet.h
index a6e7ecf67ac2a2d52e3f0474fcdf1a8ea3ea54e9..fcbe6eaf7336bc17c2c220bc361a1016f2a99f01 100644
@@ -86,6 +86,8 @@ static const char *nbk_name[] =
 #endif
     "CUDA 8x8x8", "plain C 8x8x8" };
 
+enum { ewaldexclTable, ewaldexclAnalytical };
+
 /* Atom locality indicator: local, non-local, all, used for calls to:
    gridding, pair-search, force calculation, x/f buffer operations */
 enum { eatLocal = 0, eatNonlocal = 1, eatAll  };
@@ -109,6 +111,7 @@ typedef struct {
     nbnxn_pairlist_set_t nbl_lists;   /* pair list(s)                       */
     nbnxn_atomdata_t     *nbat;       /* atom data                          */
     int                  kernel_type; /* non-bonded kernel - see enum above */
+    int                  ewald_excl;  /* Ewald exclusion - see enum above   */
 } nonbonded_verlet_group_t;
 
 /* non-bonded data structure with Verlet-type cut-off */
diff --git a/include/types/nrnb.h b/include/types/nrnb.h
index 0be7f1cd71144412a4d7b8772ef31b6e4247a881..ba594c2ba6fbd366f45f9d99dc7617e46a17b949 100644
@@ -80,11 +80,13 @@ enum
     eNR_NBKERNEL_ALLVSALLGB,
 
     eNR_NBNXN_DIST2,
-    eNR_NBNXN_LJ_RF,  eNR_NBNXN_LJ_RF_E,
-    eNR_NBNXN_LJ_TAB, eNR_NBNXN_LJ_TAB_E,
-    eNR_NBNXN_LJ,     eNR_NBNXN_LJ_E,
-    eNR_NBNXN_RF,     eNR_NBNXN_RF_E,
-    eNR_NBNXN_TAB,    eNR_NBNXN_TAB_E,
+    eNR_NBNXN_LJ_RF,    eNR_NBNXN_LJ_RF_E,
+    eNR_NBNXN_LJ_TAB,   eNR_NBNXN_LJ_TAB_E,
+    eNR_NBNXN_LJ_EWALD, eNR_NBNXN_LJ_EWALD_E,
+    eNR_NBNXN_LJ,       eNR_NBNXN_LJ_E,
+    eNR_NBNXN_RF,       eNR_NBNXN_RF_E,
+    eNR_NBNXN_TAB,      eNR_NBNXN_TAB_E,
+    eNR_NBNXN_EWALD,    eNR_NBNXN_EWALD_E,
     eNR_NB14,
     eNR_BORN_RADII_STILL,     eNR_BORN_RADII_HCT_OBC,
     eNR_BORN_CHAINRULE,
diff --git a/src/gmxlib/nrnb.c b/src/gmxlib/nrnb.c
index 3a3e5d5e226144ca3894148d3b313b4e6bdc6ba5..09c56faea7436048530b4c7a4a318c475679bf8b 100644
@@ -102,14 +102,18 @@ static const t_nrnb_data nbdata[eNRNB] = {
      */
     { "NxN RF Elec. + VdW [F]",         38 }, /* nbnxn kernel LJ+RF, no ener */
     { "NxN RF Elec. + VdW [V&F]",       54 },
-    { "NxN CSTab Elec. + VdW [F]",      41 }, /* nbnxn kernel LJ+tab, no en */
-    { "NxN CSTab Elec. + VdW [V&F]",    59 },
+    { "NxN QSTab Elec. + VdW [F]",      41 }, /* nbnxn kernel LJ+tab, no en */
+    { "NxN QSTab Elec. + VdW [V&F]",    59 },
+    { "NxN Ewald Elec. + VdW [F]",      66 }, /* nbnxn kernel LJ+Ewald, no en */
+    { "NxN Ewald Elec. + VdW [V&F]",   107 },
     { "NxN VdW [F]",                    33 }, /* nbnxn kernel LJ, no ener */
     { "NxN VdW [V&F]",                  43 },
     { "NxN RF Electrostatics [F]",      31 }, /* nbnxn kernel RF, no ener */
     { "NxN RF Electrostatics [V&F]",    36 },
-    { "NxN CSTab Elec. [F]",            34 }, /* nbnxn kernel tab, no ener */
-    { "NxN CSTab Elec. [V&F]",          41 },
+    { "NxN QSTab Elec. [F]",            34 }, /* nbnxn kernel tab, no ener */
+    { "NxN QSTab Elec. [V&F]",          41 },
+    { "NxN Ewald Elec. [F]",            61 }, /* nbnxn kernel Ewald, no ener */
+    { "NxN Ewald Elec. [V&F]",          84 },
     { "1,4 nonbonded interactions",     90 },
     { "Born radii (Still)",             47 },
     { "Born radii (HCT/OBC)",          183 },
@@ -258,8 +262,8 @@ void print_flop(FILE *out,t_nrnb *nrnb,double *nbfs,double *mflop)
 
   if (out)
   {
-      fprintf(out," NB=Group-cutoff nonbonded kernels    NxN=N-by-N tile Verlet kernels\n");
-      fprintf(out," RF=Reaction-Field  VdW=Van der Waals  CSTab=Cubic-spline table\n");
+      fprintf(out," NB=Group-cutoff nonbonded kernels    NxN=N-by-N cluster Verlet kernels\n");
+      fprintf(out," RF=Reaction-Field  VdW=Van der Waals  QSTab=quadratic-spline table\n");
       fprintf(out," W3=SPC/TIP3p  W4=TIP4p (single or pairs)\n");
       fprintf(out," V&F=Potential and force  V=Potential only  F=Force only\n\n");
 
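Two legend changes accompany the new counters: the Verlet kernels are now labelled "N-by-N cluster" kernels (they operate on clusters of atoms rather than single particles), and CSTab becomes QSTab, since the Ewald table used by these kernels stores the force and reconstructs the potential with a half-step trapezoid term (mhalfsp_SSE = -0.5/tabq_scale in the SIMD kernel), which corresponds to a quadratic rather than cubic spline for the potential. In the notation below, s is the table scale, eps the fractional table position, and F_i, V_i the tabulated force and potential at knot i (symbols chosen here for illustration):

    F(r) = F_i + \epsilon\,(F_{i+1}-F_i),
    \qquad
    V(r) \approx V_i - \frac{\epsilon}{2s}\,\bigl(F_i + F(r)\bigr).

The new Ewald rows (66/107 flops per pair with LJ, 61/84 without) are the per-pair costs of the analytical kernels added in this change.
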
diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c
index 09af686f8e46562e1c2a2284924c0942670b9cf3..fa56c0064119e38409b30a7d1279aca465dda52c 100644
@@ -1401,9 +1401,11 @@ static void init_forcerec_f_threads(t_forcerec *fr,int nenergrp)
 static void pick_nbnxn_kernel_cpu(FILE *fp,
                                   const t_commrec *cr,
                                   const gmx_cpuid_t cpuid_info,
-                                  int *kernel_type)
+                                  int *kernel_type,
+                                  int *ewald_excl)
 {
     *kernel_type = nbk4x4_PlainC;
+    *ewald_excl  = ewaldexclTable;
 
 #ifdef GMX_X86_SSE2
     {
@@ -1436,6 +1438,23 @@ static void pick_nbnxn_kernel_cpu(FILE *fp,
             gmx_fatal(FARGS,"You requested AVX-256 nbnxn kernels, but GROMACS was built without AVX support");
 #endif
         }
+
+        /* Analytical Ewald exclusion correction is only an option in the
+         * x86 SIMD kernel. This is faster in single precision
+         * on Bulldozer and slightly faster on Sandy Bridge.
+         */
+#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
+        *ewald_excl = ewaldexclAnalytical;
+#endif
+        if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
+        {
+            *ewald_excl = ewaldexclTable;
+        }
+        if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
+        {
+            *ewald_excl = ewaldexclAnalytical;
+        }
+
     }
 #endif /* GMX_X86_SSE2 */
 }
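
The selection logic above defaults to the tabulated exclusion correction, switches to the analytical form at compile time for single-precision AVX builds (128-bit FMA or 256-bit), and then lets two environment variables override the choice at run time. Because GMX_NBNXN_EWALD_ANALYTICAL is checked last, it wins if both variables are set. A minimal stand-alone sketch of that precedence (have_avx_single stands in for the compile-time #if; the enum mirrors nb_verlet.h):

    #include <stdlib.h>

    enum { ewaldexclTable, ewaldexclAnalytical };

    static int pick_ewald_excl(int have_avx_single)
    {
        int ewald_excl = ewaldexclTable;        /* default: table lookup      */

        if (have_avx_single)                    /* AVX, single precision      */
        {
            ewald_excl = ewaldexclAnalytical;
        }
        if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
        {
            ewald_excl = ewaldexclTable;        /* force the tabulated path   */
        }
        if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
        {
            ewald_excl = ewaldexclAnalytical;   /* checked last, so it wins   */
        }
        return ewald_excl;
    }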
@@ -1445,7 +1464,8 @@ static void pick_nbnxn_kernel(FILE *fp,
                               const gmx_hw_info_t *hwinfo,
                               gmx_bool use_cpu_acceleration,
                               gmx_bool *bUseGPU,
-                              int *kernel_type)
+                              int *kernel_type,
+                              int *ewald_excl)
 {
     gmx_bool bEmulateGPU, bGPU;
     char gpu_err_str[STRLEN];
@@ -1453,6 +1473,7 @@ static void pick_nbnxn_kernel(FILE *fp,
     assert(kernel_type);
 
     *kernel_type = nbkNotSet;
+    *ewald_excl  = ewaldexclTable;
     /* if bUseGPU == NULL we don't want a GPU (e.g. hybrid mode kernel selection) */
     bGPU = (bUseGPU != NULL) && hwinfo->bCanUseGPU;
 
@@ -1500,7 +1521,8 @@ static void pick_nbnxn_kernel(FILE *fp,
     {
         if (use_cpu_acceleration)
         {
-            pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,kernel_type);
+            pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,
+                                  kernel_type,ewald_excl);
         }
         else
         {
@@ -1722,7 +1744,8 @@ static void init_nb_verlet(FILE *fp,
         {
             pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
                               &nbv->bUseGPU,
-                              &nbv->grp[i].kernel_type);
+                              &nbv->grp[i].kernel_type,
+                              &nbv->grp[i].ewald_excl);
         }
         else /* non-local */
         {
@@ -1731,7 +1754,8 @@ static void init_nb_verlet(FILE *fp,
                 /* Use GPU for local, select a CPU kernel for non-local */
                 pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
                                   NULL,
-                                  &nbv->grp[i].kernel_type);
+                                  &nbv->grp[i].kernel_type,
+                                  &nbv->grp[i].ewald_excl);
 
                 bHybridGPURun = TRUE;
             }
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c
index a068914b8721fbd7181b97c5d31d14126f549392..bdbe504e12bd4155bd60e2a997c2321da4b8aba7 100644
 
 #undef CALC_COUL_TAB
 
+/* Analytical Ewald exclusion interaction electrostatics kernels */
+#define CALC_COUL_EWALD
+
+/* Single cut-off: rcoulomb = rvdw */
+#include "nbnxn_kernel_x86_simd_includes.h"
+
+/* Twin cut-off: rcoulomb >= rvdw */
+#define VDW_CUTOFF_CHECK
+#include "nbnxn_kernel_x86_simd_includes.h"
+#undef VDW_CUTOFF_CHECK
+
+#undef CALC_COUL_EWALD
+
 
 typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl,
                                 const nbnxn_atomdata_t     *nbat,
@@ -91,41 +104,34 @@ typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
                                   real                       *f,
                                   real                       *fshift);
 
-enum { coultRF, coultTAB, coultTAB_TWIN, coultNR };
-
+enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR };
 
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_ener
 static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
-{ { nbnxn_kernel_x86_simd128_rf_comb_geom_ener,
-    nbnxn_kernel_x86_simd128_rf_comb_lb_ener,
-    nbnxn_kernel_x86_simd128_rf_comb_none_ener },
-  { nbnxn_kernel_x86_simd128_tab_comb_geom_ener,
-    nbnxn_kernel_x86_simd128_tab_comb_lb_ener,
-    nbnxn_kernel_x86_simd128_tab_twin_comb_none_ener },
-  { nbnxn_kernel_x86_simd128_tab_twin_comb_geom_ener,
-    nbnxn_kernel_x86_simd128_tab_twin_comb_lb_ener,
-    nbnxn_kernel_x86_simd128_tab_twin_comb_none_ener }  };
-
+{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
+  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
+  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
+  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
+  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
+#undef NBK_FN
+
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_energrp
 static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
-{ { nbnxn_kernel_x86_simd128_rf_comb_geom_energrp,
-    nbnxn_kernel_x86_simd128_rf_comb_lb_energrp,
-    nbnxn_kernel_x86_simd128_rf_comb_none_energrp },
-  { nbnxn_kernel_x86_simd128_tab_comb_geom_energrp,
-    nbnxn_kernel_x86_simd128_tab_comb_lb_energrp,
-    nbnxn_kernel_x86_simd128_tab_comb_none_energrp },
-  { nbnxn_kernel_x86_simd128_tab_twin_comb_geom_energrp,
-    nbnxn_kernel_x86_simd128_tab_twin_comb_lb_energrp,
-    nbnxn_kernel_x86_simd128_tab_twin_comb_none_energrp } };
-
+{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
+  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
+  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
+  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
+  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
+#undef NBK_FN
+
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_noener
 static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
-{ { nbnxn_kernel_x86_simd128_rf_comb_geom_noener,
-    nbnxn_kernel_x86_simd128_rf_comb_lb_noener,
-    nbnxn_kernel_x86_simd128_rf_comb_none_noener },
-  { nbnxn_kernel_x86_simd128_tab_comb_geom_noener,
-    nbnxn_kernel_x86_simd128_tab_comb_lb_noener,
-    nbnxn_kernel_x86_simd128_tab_comb_none_noener },
-  { nbnxn_kernel_x86_simd128_tab_twin_comb_geom_noener,
-    nbnxn_kernel_x86_simd128_tab_twin_comb_lb_noener,
-    nbnxn_kernel_x86_simd128_tab_twin_comb_none_noener } };
+{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
+  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
+  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
+  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
+  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
+#undef NBK_FN
 
 
 static void reduce_group_energies(int ng,int ng_2log,
@@ -174,6 +180,7 @@ void
 nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
                          const nbnxn_atomdata_t     *nbat,
                          const interaction_const_t  *ic,
+                         int                        ewald_excl,
                          rvec                       *shift_vec, 
                          int                        force_flags,
                          int                        clearF,
@@ -196,13 +203,27 @@ nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
     }
     else
     {
-        if (ic->rcoulomb == ic->rvdw)
+        if (ewald_excl == ewaldexclTable)
         {
-            coult = coultTAB;
+            if (ic->rcoulomb == ic->rvdw)
+            {
+                coult = coultTAB;
+            }
+            else
+            {
+                coult = coultTAB_TWIN;
+            }
         }
         else
         {
-            coult = coultTAB_TWIN;
+            if (ic->rcoulomb == ic->rvdw)
+            {
+                coult = coultEWALD;
+            }
+            else
+            {
+                coult = coultEWALD_TWIN;
+            }
         }
     }
 
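The hand-written 3x3 kernel-pointer tables are replaced by 5x3 tables generated with a short-lived NBK_FN macro that token-pastes the electrostatics flavour and the LJ combination rule into the kernel name; the macro is #undef'ed after each table so the same name can be redefined for the _ener, _energrp and _noener variants. A toy illustration of the ## pasting pattern (the kern_* names here are made up for the example, not taken from GROMACS):

    #include <stdio.h>

    static void kern_rf_geom(void)    { puts("rf/geom"); }
    static void kern_rf_none(void)    { puts("rf/none"); }
    static void kern_ewald_geom(void) { puts("ewald/geom"); }
    static void kern_ewald_none(void) { puts("ewald/none"); }

    typedef void (*kern_fn)(void);

    /* Paste the two selectors into one identifier, like NBK_FN above */
    #define KERN(elec, ljcomb) kern_##elec##_##ljcomb

    static kern_fn kern_table[2][2] =
    { { KERN(rf,    geom), KERN(rf,    none) },
      { KERN(ewald, geom), KERN(ewald, none) } };
    #undef KERN

    int main(void)
    {
        kern_table[1][0]();   /* prints "ewald/geom" */
        return 0;
    }
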
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.h
index 5732f9e1ca0a7ebcabf1ed982886413b16af62f9..5488bfa22fd464c88d7cfaddd6c7188cdc0bdcc4 100644
@@ -46,6 +46,7 @@ void
 nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
                          const nbnxn_atomdata_t     *nbat,
                          const interaction_const_t  *ic,
+                         int                        ewald_excl,
                          rvec                       *shift_vec,
                          int                        force_flags,
                          int                        clearF,
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c
index da236732daa2fbcc20d3fadb07d501aeae9b2bbe..89aac413cbcf1d313eaef2ce60b048aa230298c5 100644
 
 #undef CALC_COUL_TAB
 
+/* Analytical Ewald exclusion interaction electrostatics kernels */
+#define CALC_COUL_EWALD
+
+/* Single cut-off: rcoulomb = rvdw */
+#include "nbnxn_kernel_x86_simd_includes.h"
+
+/* Twin cut-off: rcoulomb >= rvdw */
+#define VDW_CUTOFF_CHECK
+#include "nbnxn_kernel_x86_simd_includes.h"
+#undef VDW_CUTOFF_CHECK
+
+#undef CALC_COUL_EWALD
+
 
 typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl,
                                 const nbnxn_atomdata_t     *nbat,
@@ -91,41 +104,34 @@ typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
                                   real                       *f,
                                   real                       *fshift);
 
-enum { coultRF, coultTAB, coultTAB_TWIN, coultNR };
-
+enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR };
 
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_ener
 static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
-{ { nbnxn_kernel_x86_simd256_rf_comb_geom_ener,
-    nbnxn_kernel_x86_simd256_rf_comb_lb_ener,
-    nbnxn_kernel_x86_simd256_rf_comb_none_ener },
-  { nbnxn_kernel_x86_simd256_tab_comb_geom_ener,
-    nbnxn_kernel_x86_simd256_tab_comb_lb_ener,
-    nbnxn_kernel_x86_simd256_tab_twin_comb_none_ener },
-  { nbnxn_kernel_x86_simd256_tab_twin_comb_geom_ener,
-    nbnxn_kernel_x86_simd256_tab_twin_comb_lb_ener,
-    nbnxn_kernel_x86_simd256_tab_twin_comb_none_ener }  };
-
+{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
+  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
+  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
+  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
+  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
+#undef NBK_FN
+
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_energrp
 static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
-{ { nbnxn_kernel_x86_simd256_rf_comb_geom_energrp,
-    nbnxn_kernel_x86_simd256_rf_comb_lb_energrp,
-    nbnxn_kernel_x86_simd256_rf_comb_none_energrp },
-  { nbnxn_kernel_x86_simd256_tab_comb_geom_energrp,
-    nbnxn_kernel_x86_simd256_tab_comb_lb_energrp,
-    nbnxn_kernel_x86_simd256_tab_comb_none_energrp },
-  { nbnxn_kernel_x86_simd256_tab_twin_comb_geom_energrp,
-    nbnxn_kernel_x86_simd256_tab_twin_comb_lb_energrp,
-    nbnxn_kernel_x86_simd256_tab_twin_comb_none_energrp } };
-
+{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
+  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
+  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
+  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
+  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
+#undef NBK_FN
+
+#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_noener
 static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
-{ { nbnxn_kernel_x86_simd256_rf_comb_geom_noener,
-    nbnxn_kernel_x86_simd256_rf_comb_lb_noener,
-    nbnxn_kernel_x86_simd256_rf_comb_none_noener },
-  { nbnxn_kernel_x86_simd256_tab_comb_geom_noener,
-    nbnxn_kernel_x86_simd256_tab_comb_lb_noener,
-    nbnxn_kernel_x86_simd256_tab_comb_none_noener },
-  { nbnxn_kernel_x86_simd256_tab_twin_comb_geom_noener,
-    nbnxn_kernel_x86_simd256_tab_twin_comb_lb_noener,
-    nbnxn_kernel_x86_simd256_tab_twin_comb_none_noener } };
+{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
+  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
+  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
+  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
+  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
+#undef NBK_FN
 
 
 static void reduce_group_energies(int ng,int ng_2log,
@@ -174,6 +180,7 @@ void
 nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
                          const nbnxn_atomdata_t     *nbat,
                          const interaction_const_t  *ic,
+                         int                        ewald_excl,
                          rvec                       *shift_vec, 
                          int                        force_flags,
                          int                        clearF,
@@ -196,13 +203,27 @@ nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
     }
     else
     {
-        if (ic->rcoulomb == ic->rvdw)
+        if (ewald_excl == ewaldexclTable)
         {
-            coult = coultTAB;
+            if (ic->rcoulomb == ic->rvdw)
+            {
+                coult = coultTAB;
+            }
+            else
+            {
+                coult = coultTAB_TWIN;
+            }
         }
         else
         {
-            coult = coultTAB_TWIN;
+            if (ic->rcoulomb == ic->rvdw)
+            {
+                coult = coultEWALD;
+            }
+            else
+            {
+                coult = coultEWALD_TWIN;
+            }
         }
     }
 
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.h
index c56754284e609505b57b1859a0b4f034c32c43ee..d2246c93517078532e55242d7f491495b5c8951e 100644
@@ -46,6 +46,7 @@ void
 nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
                          const nbnxn_atomdata_t     *nbat,
                          const interaction_const_t  *ic,
+                         int                        ewald_excl,
                          rvec                       *shift_vec,
                          int                        force_flags,
                          int                        clearF,
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_inner.h
index 56d04acee5c6b6c62d79be87b9e609375f3deac8..ad818dbeb0040b135daaaf916285e13c11268a29 100644
@@ -45,7 +45,7 @@
 #define EXCL_FORCES
 #endif
 
-#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_X86_SSE4_1 && !defined COUNT_PAIRS && !(defined __GNUC__ && (defined CALC_COUL_TAB || (defined CALC_COUL_RF && defined GMX_MM128_HERE)))
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES || defined CALC_COUL_EWALD) && defined GMX_X86_SSE4_1 && !defined COUNT_PAIRS && !(defined __GNUC__ && (defined CALC_COUL_TAB || (defined CALC_COUL_RF && defined GMX_MM128_HERE)))
 /* Without exclusions and energies we only need to mask the cut-off,
  * this is faster with blendv (only available with SSE4.1 and later).
  * With gcc and PME or RF in 128-bit, blendv is slower;
             gmx_mm_pr  fsub_SSE2;
             gmx_mm_pr  fsub_SSE3;
 #endif
+#ifdef CALC_COUL_EWALD
+            gmx_mm_pr  brsq_SSE0,brsq_SSE1,brsq_SSE2,brsq_SSE3;
+            gmx_mm_pr  ewcorr_SSE0,ewcorr_SSE1,ewcorr_SSE2,ewcorr_SSE3;
+#endif
+
             /* frcoul = (1/r - fsub)*r */
             gmx_mm_pr  frcoul_SSE0;
             gmx_mm_pr  frcoul_SSE1;
             gmx_mm_pr  ctabv_SSE1;
             gmx_mm_pr  ctabv_SSE2;
             gmx_mm_pr  ctabv_SSE3;
+#endif
+#endif
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
             /* The potential (PME mesh) we need to subtract from 1/r */
             gmx_mm_pr  vc_sub_SSE0;
             gmx_mm_pr  vc_sub_SSE1;
             gmx_mm_pr  vc_sub_SSE2;
             gmx_mm_pr  vc_sub_SSE3;
 #endif
-#endif
 #ifdef CALC_ENERGIES
             /* Electrostatic potential */
             gmx_mm_pr  vcoul_SSE0;
 #endif
 #endif
 
+#ifdef CALC_COUL_EWALD
+            brsq_SSE0     = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE0,wco_SSE0));
+            brsq_SSE1     = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE1,wco_SSE1));
+            brsq_SSE2     = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE2,wco_SSE2));
+            brsq_SSE3     = gmx_mul_pr(beta2_SSE,gmx_and_pr(rsq_SSE3,wco_SSE3));
+            ewcorr_SSE0   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE0),beta_SSE);
+            ewcorr_SSE1   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE1),beta_SSE);
+            ewcorr_SSE2   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE2),beta_SSE);
+            ewcorr_SSE3   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_SSE3),beta_SSE);
+            frcoul_SSE0   = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(ewcorr_SSE0,brsq_SSE0)));
+            frcoul_SSE1   = gmx_mul_pr(qq_SSE1,gmx_add_pr(rinv_ex_SSE1,gmx_mul_pr(ewcorr_SSE1,brsq_SSE1)));
+            frcoul_SSE2   = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(ewcorr_SSE2,brsq_SSE2)));
+            frcoul_SSE3   = gmx_mul_pr(qq_SSE3,gmx_add_pr(rinv_ex_SSE3,gmx_mul_pr(ewcorr_SSE3,brsq_SSE3)));
+
+#ifdef CALC_ENERGIES
+            vc_sub_SSE0   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE0),beta_SSE);
+            vc_sub_SSE1   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE1),beta_SSE);
+            vc_sub_SSE2   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE2),beta_SSE);
+            vc_sub_SSE3   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_SSE3),beta_SSE);
+#endif
+
+#endif /* CALC_COUL_EWALD */
+
 #ifdef CALC_COUL_TAB
             /* Electrostatic interactions */
             r_SSE0        = gmx_mul_pr(rsq_SSE0,rinv_SSE0);
             vc_sub_SSE1   = gmx_add_pr(ctabv_SSE1,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE1),gmx_add_pr(ctab0_SSE1,fsub_SSE1)));
             vc_sub_SSE2   = gmx_add_pr(ctabv_SSE2,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE2),gmx_add_pr(ctab0_SSE2,fsub_SSE2)));
             vc_sub_SSE3   = gmx_add_pr(ctabv_SSE3,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE3),gmx_add_pr(ctab0_SSE3,fsub_SSE3)));
+#endif
+#endif /* CALC_COUL_TAB */
 
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 #ifndef NO_SHIFT_EWALD
             /* Add Ewald potential shift to vc_sub for convenience */
 #ifdef CHECK_EXCLS
             vcoul_SSE3    = gmx_mul_pr(qq_SSE3,gmx_sub_pr(rinv_ex_SSE3,vc_sub_SSE3));
 
 #endif
-#endif
 
 #ifdef CALC_ENERGIES
             /* Mask energy for cut-off and diagonal */
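
The CALC_COUL_EWALD block above computes the exclusion/PME correction in place of a table lookup: brsq = beta^2 r^2 is formed (masked by the cut-off result wco), gmx_pmecorrF_pr supplies the force correction folded into frcoul, and with CALC_ENERGIES gmx_pmecorrV_pr supplies the potential correction vc_sub. Matching this against the real-space Ewald force quoted after the gmx_x86_simd_macros.h diff, the two helpers appear to be used as approximations of the following functions of z = beta*r (an interpretation of the code above, not a statement of their exact polynomial form):

    \mathrm{pmecorrV}(z^{2}) \approx \frac{\mathrm{erf}(z)}{z},
    \qquad
    \mathrm{pmecorrF}(z^{2}) \approx
    \frac{\tfrac{2z}{\sqrt{\pi}}\,e^{-z^{2}} - \mathrm{erf}(z)}{z^{3}},

so that vc_sub = beta*pmecorrV(z^2) equals erf(beta r)/r, the term subtracted from 1/r in the energy, and frcoul = qq*(1/r + beta*pmecorrF(z^2)*z^2) equals r times the erfc-screened force, which is exactly what the tabulated path produces by interpolation.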
diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h
index 02b6e21fbcd5aefd36af2e1849d98ed8485c12a5..153e9f5b20cca110c9494f06b6b87b5399ea6824 100644
 #define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)
 #endif
 #endif
+#ifdef CALC_COUL_EWALD
+#ifndef VDW_CUTOFF_CHECK
+#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald,e)
+#else
+#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald_twin,e)
+#endif
+#endif
 
 #ifdef GMX_MM128_HERE
 #define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
@@ -270,6 +277,7 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 #ifdef CALC_ENERGIES
     gmx_mm_pr  hrc_3_SSE,moh_rc_SSE;
 #endif
+
 #ifdef CALC_COUL_TAB
     /* Coulomb table variables */
     gmx_mm_pr  invtsp_SSE;
@@ -285,10 +293,17 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 #endif
 #ifdef CALC_ENERGIES
     gmx_mm_pr  mhalfsp_SSE;
-    gmx_mm_pr  sh_ewald_SSE;
 #endif
 #endif
 
+#ifdef CALC_COUL_EWALD
+    gmx_mm_pr beta2_SSE,beta_SSE;
+#endif
+
+#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+    gmx_mm_pr  sh_ewald_SSE;
+#endif
+
 #ifdef LJ_COMB_LB
     const real *ljc;
 
@@ -368,8 +383,6 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
     invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
 #ifdef CALC_ENERGIES
     mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
-
-    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
 #endif
 
 #ifdef TAB_FDV0
@@ -378,6 +391,15 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
     tab_coul_F = ic->tabq_coul_F;
     tab_coul_V = ic->tabq_coul_V;
 #endif
+#endif /* CALC_COUL_TAB */
+
+#ifdef CALC_COUL_EWALD
+    beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
+    beta_SSE  = gmx_set1_pr(ic->ewaldcoeff);
+#endif
+
+#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
+    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
 #endif
 
     q                   = nbat->q;
@@ -523,6 +545,10 @@ NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 #else
             Vc_sub_self = 0.5*tab_coul_V[0];
 #endif
+#endif
+#ifdef CALC_COUL_EWALD
+            /* 0.5*beta*2/sqrt(pi) */
+            Vc_sub_self = 0.5*ic->ewaldcoeff*1.128379167095513;
 #endif
 
             for(ia=0; ia<UNROLLI; ia++)
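
For the analytical path the per-atom self-exclusion term is computed in closed form instead of being read from table entry 0. The hard-coded constant 1.128379167095513 is 2/sqrt(pi), so the assignment is the r -> 0 limit of the correction potential, with the same factor 0.5 as the tabulated 0.5*tab_coul_V[0] variant:

    V_{\mathrm{self}}
    = \tfrac{1}{2}\,\lim_{r\to 0}\frac{\mathrm{erf}(\beta r)}{r}
    = \tfrac{1}{2}\cdot\frac{2\beta}{\sqrt{\pi}}
    = \frac{\beta}{\sqrt{\pi}}
    \;\approx\; 0.5\times\mathtt{ewaldcoeff}\times 1.128379167095513 .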
diff --git a/src/mdlib/sim_util.c b/src/mdlib/sim_util.c
index ecdd223fa0e06192e6d7e1f9c77a41ff36cfd090..bd75bdc1e036e1d2ae7a193d978384ed1ed305f9 100644
@@ -598,7 +598,7 @@ static void do_nb_verlet(t_forcerec *fr,
                          t_nrnb *nrnb,
                          gmx_wallcycle_t wcycle)
 {
-    int     nnbl, kernel_type, sh_e;
+    int     nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
     char    *env;
     nonbonded_verlet_group_t  *nbvg;
 
@@ -638,6 +638,7 @@ static void do_nb_verlet(t_forcerec *fr,
         case nbk4xN_X86_SIMD128:
             nbnxn_kernel_x86_simd128(&nbvg->nbl_lists,
                                      nbvg->nbat, ic,
+                                     nbvg->ewald_excl,
                                      fr->shift_vec,
                                      flags,
                                      clearF,
@@ -650,6 +651,7 @@ static void do_nb_verlet(t_forcerec *fr,
         case nbk4xN_X86_SIMD256:
             nbnxn_kernel_x86_simd256(&nbvg->nbl_lists,
                                      nbvg->nbat, ic,
+                                     nbvg->ewald_excl,
                                      fr->shift_vec,
                                      flags,
                                      clearF,
@@ -687,16 +689,31 @@ static void do_nb_verlet(t_forcerec *fr,
         wallcycle_sub_stop(wcycle, ewcsNONBONDED);
     }
 
-    /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
-    sh_e = ((flags & GMX_FORCE_ENERGY) ? 1 : 0);
-    inc_nrnb(nrnb,
-             ((EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) ?
-              eNR_NBNXN_LJ_RF : eNR_NBNXN_LJ_TAB) + sh_e,
+    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+    {
+        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
+    }
+    else if (nbvg->ewald_excl == ewaldexclTable)
+    {
+        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
+    }
+    else
+    {
+        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
+    }
+    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
+    if (flags & GMX_FORCE_ENERGY)
+    {
+        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
+        enr_nbnxn_kernel_ljc += 1;
+        enr_nbnxn_kernel_lj  += 1;
+    }
+
+    inc_nrnb(nrnb,enr_nbnxn_kernel_ljc,
              nbvg->nbl_lists.natpair_ljq);
-    inc_nrnb(nrnb,eNR_NBNXN_LJ+sh_e,nbvg->nbl_lists.natpair_lj);
-    inc_nrnb(nrnb,
-             ((EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) ?
-              eNR_NBNXN_RF : eNR_NBNXN_TAB)+sh_e,
+    inc_nrnb(nrnb,enr_nbnxn_kernel_lj,
+             nbvg->nbl_lists.natpair_lj);
+    inc_nrnb(nrnb,enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
              nbvg->nbl_lists.natpair_q);
 }