Merge "Fixing a problem with dh/dl 1-4 interactions" into release-4-6

author Mark Abraham <mark.j.abraham@gmail.com>

Mon, 29 Apr 2013 11:20:53 +0000 (13:20 +0200)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Mon, 29 Apr 2013 11:20:53 +0000 (13:20 +0200)
author Mark Abraham <mark.j.abraham@gmail.com>
Mon, 29 Apr 2013 11:20:53 +0000 (13:20 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Mon, 29 Apr 2013 11:20:53 +0000 (13:20 +0200)
diff --git a/include/mdrun.h b/include/mdrun.h

index 1373bfd55eaa0d0bcfb88aba99e2eb16587e8b76..a11fa78ac34145c2bdf2f912be8dc3e1e9e941e7 100644 (file)
--- a/include/mdrun.h
+++ b/include/mdrun.h
@@ -215,7 +215,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
               real rdd, real rconstr, const char *dddlb_opt, real dlb_scale,
               const char *ddcsx, const char *ddcsy, const char *ddcsz,
               const char *nbpu_opt,
-             int nsteps_cmdline, int nstepout, int resetstep,
+             gmx_large_int_t nsteps_cmdline, int nstepout, int resetstep,
               int nmultisim, int repl_ex_nst, int repl_ex_nex,
               int repl_ex_seed, real pforce, real cpt_period, real max_hours,
               const char *deviceOptions, unsigned long Flags);
diff --git a/src/gmxlib/selection/compiler.c b/src/gmxlib/selection/compiler.c

index 0ccb548d3e2a7902c927c06433b8747680fda2b7..326635cdd0d866d1e463d5b5d70bdeb1706de4f9 100644 (file)
--- a/src/gmxlib/selection/compiler.c
+++ b/src/gmxlib/selection/compiler.c
@@ -314,8 +314,15 @@ enum
      SEL_CDATA_MINMAXALLOC = 16,
      /** Whether subexpressions use simple pass evaluation functions. */
      SEL_CDATA_SIMPLESUBEXPR = 32,
-    /** Whether this expressions is a part of a common subexpression. */
-    SEL_CDATA_COMMONSUBEXPR = 64
+    /*! \brief
+     * Whether a static subexpression needs to support multiple evaluations.
+     *
+     * This flag may only be set on \ref SEL_SUBEXPR elements that also have
+     * SEL_CDATA_SIMPLESUBEXPR.
+     */
+    SEL_CDATA_STATICMULTIEVALSUBEXPR = 64,
+    /** Whether this expression is a part of a common subexpression. */
+    SEL_CDATA_COMMONSUBEXPR = 128
  };
  
  /*! \internal \brief
@@ -397,6 +404,10 @@ _gmx_selelem_print_compiler_info(FILE *fp, t_selelem *sel, int level)
      {
          fprintf(fp, "Ss");
      }
+    if (sel->cdata->flags & SEL_CDATA_STATICMULTIEVALSUBEXPR)
+    {
+        fprintf(fp, "Sm");
+    }
      if (sel->cdata->flags & SEL_CDATA_COMMONSUBEXPR)
      {
          fprintf(fp, "Sc");
@@ -1102,19 +1113,25 @@ init_item_evalfunc(t_selelem *sel)
              break;
  
          case SEL_SUBEXPR:
-            sel->evaluate = (sel->refcount == 2
-                             ? &_gmx_sel_evaluate_subexpr_simple
-                             : &_gmx_sel_evaluate_subexpr);
+            if (sel->refcount == 2
+                && !(sel->cdata->flags & SEL_CDATA_STATICMULTIEVALSUBEXPR))
+            {
+                sel->evaluate = &_gmx_sel_evaluate_subexpr_simple;
+            }
+            else
+            {
+                sel->evaluate = &_gmx_sel_evaluate_subexpr;
+            }
              break;
  
          case SEL_SUBEXPRREF:
              sel->name     = sel->child->name;
-            sel->evaluate = (sel->child->refcount == 2
+            sel->evaluate = ((sel->cdata->flags & SEL_CDATA_SIMPLESUBEXPR)
                               ? &_gmx_sel_evaluate_subexprref_simple
                               : &_gmx_sel_evaluate_subexprref);
              break;
      }
-
+    sel->cdata->evaluate = sel->evaluate;
      return TRUE;
  }
  
@@ -1179,7 +1196,8 @@ init_item_evaloutput(t_selelem *sel)
          }
      }
  
-    if (sel->type == SEL_SUBEXPR && sel->refcount == 2)
+    if (sel->type == SEL_SUBEXPR && sel->refcount == 2
+        && !(sel->cdata->flags & SEL_CDATA_STATICMULTIEVALSUBEXPR))
      {
          sel->flags &= ~(SEL_ALLOCVAL | SEL_ALLOCDATA);
          if (sel->v.type == GROUP_VALUE || sel->v.type == POS_VALUE)
@@ -1199,7 +1217,8 @@ init_item_evaloutput(t_selelem *sel)
              _gmx_selvalue_setstore(&sel->v, sel->child->v.u.ptr);
          }
      }
-    else if (sel->type == SEL_SUBEXPRREF && sel->child->refcount == 2)
+    else if (sel->type == SEL_SUBEXPRREF
+             && (sel->cdata->flags & SEL_CDATA_SIMPLESUBEXPR))
      {
          if (sel->v.u.ptr)
          {
@@ -1361,6 +1380,15 @@ init_item_staticeval(t_selelem *sel)
                      init_item_staticeval(child);
                  }
              }
+            /* If an expression is evaluated for a dynamic group, then also
+             * atom-valued parameters need to be evaluated every time. */
+            if ((sel->flags & SEL_DYNAMIC)
+                && (sel->type == SEL_EXPRESSION || sel->type == SEL_MODIFIER)
+                && (child->flags & SEL_ATOMVAL))
+            {
+                child->flags        |= SEL_DYNAMIC;
+                child->cdata->flags &= ~SEL_CDATA_STATIC;
+            }
              child = child->next;
          }
      }
@@ -1417,7 +1445,17 @@ init_item_subexpr_flags(t_selelem *sel)
      }
      else if (sel->type == SEL_SUBEXPRREF && sel->child->refcount == 2)
      {
-        sel->cdata->flags |= SEL_CDATA_SIMPLESUBEXPR;
+        /* See similar condition in init_item_staticeval(). */
+        if ((sel->flags & SEL_ATOMVAL)
+            && (sel->flags & SEL_DYNAMIC)
+            && !(sel->child->flags & SEL_DYNAMIC))
+        {
+            sel->child->cdata->flags |= SEL_CDATA_STATICMULTIEVALSUBEXPR;
+        }
+        else
+        {
+            sel->cdata->flags |= SEL_CDATA_SIMPLESUBEXPR;
+        }
      }
  
      /* Process children, but only follow subexpression references if the
@@ -2180,7 +2218,9 @@ analyze_static(gmx_sel_evaluate_t *data, t_selelem *sel, gmx_ana_index_t *g)
              break;
  
          case SEL_SUBEXPR:
-            if (sel->cdata->flags & (SEL_CDATA_SIMPLESUBEXPR | SEL_CDATA_FULLEVAL))
+            if (((sel->cdata->flags & SEL_CDATA_SIMPLESUBEXPR) &&
+                 !(sel->cdata->flags & SEL_CDATA_STATICMULTIEVALSUBEXPR))
+                || (sel->cdata->flags & SEL_CDATA_FULLEVAL))
              {
                  rc = sel->cdata->evaluate(data, sel, g);
                  _gmx_selvalue_setstore(&sel->v, sel->child->v.u.ptr);
@@ -2491,6 +2531,17 @@ postprocess_item_subexpressions(t_selelem *sel)
          sel->child->flags   &= ~(SEL_ALLOCVAL | SEL_ALLOCDATA);
          sel->child->v.nalloc = -1;
      }
+
+    /* For static subexpressions with a dynamic evaluation group, there is
+     * no need to evaluate them again, as the SEL_SUBEXPRREF takes care of
+     * everything during evaluation. */
+    if (sel->type == SEL_SUBEXPR
+        && (sel->cdata->flags & SEL_CDATA_SIMPLESUBEXPR)
+        && (sel->cdata->flags & SEL_CDATA_STATICMULTIEVALSUBEXPR))
+    {
+        sel->evaluate        = NULL;
+        sel->cdata->evaluate = NULL;
+    }
  }
  
  
@@ -2735,24 +2786,37 @@ gmx_ana_selcollection_compile(gmx_ana_selcollection_t *sc)
              /* FIXME: Clean up the collection */
              return -1;
          }
-        /* Initialize evaluation function. */
-        if (!init_item_evalfunc(item))
-        {
-            /* FIXME: Clean up the collection */
-            return -1;
-        }
-        setup_memory_pooling(item, sc->mempool);
          /* Initialize the compiler data */
          init_item_compilerdata(item);
+        item = item->next;
+    }
+    /* Initialize the static evaluation compiler flags.
+     * Requires the FULLEVAL compiler flag for the whole tree. */
+    item = sc->root;
+    while (item)
+    {
          init_item_staticeval(item);
          item = item->next;
      }
-    /* Initialize subexpression flags and evaluation output.
+    /* Initialize subexpression flags.
       * Requires compiler flags for the full tree. */
      item = sc->root;
      while (item)
      {
          init_item_subexpr_flags(item);
+        item = item->next;
+    }
+    /* Initialize evaluation.
+     * Requires subexpression flags. */
+    item = sc->root;
+    while (item)
+    {
+        if (!init_item_evalfunc(item))
+        {
+            /* FIXME: Clean up the collection */
+            return -1;
+        }
+        setup_memory_pooling(item, sc->mempool);
          init_item_evaloutput(item);
          item = item->next;
      }
diff --git a/src/gmxlib/selection/evaluate.c b/src/gmxlib/selection/evaluate.c

index b77a93eff301999a6eb7d20ab5b2af6d9fe43495..bd09533686c037012d9f1858862fac8d007b19af 100644 (file)
--- a/src/gmxlib/selection/evaluate.c
+++ b/src/gmxlib/selection/evaluate.c
@@ -220,7 +220,8 @@ gmx_ana_selcollection_evaluate(gmx_ana_selcollection_t *sc,
      while (sel)
      {
          /* Clear the evaluation group of subexpressions */
-        if (sel->child && sel->child->type == SEL_SUBEXPR)
+        if (sel->child && sel->child->type == SEL_SUBEXPR
+            && sel->child->evaluate != NULL)
          {
              sel->child->u.cgrp.isize = 0;
              /* Not strictly necessary, because the value will be overwritten
@@ -664,7 +665,7 @@ _gmx_sel_evaluate_subexprref(gmx_sel_evaluate_t *data, t_selelem *sel, gmx_ana_i
      t_selelem *expr;
      int        i, j;
  
-    if (g)
+    if (g != NULL && sel->child->evaluate != NULL)
      {
          int rc;
  
diff --git a/src/kernel/mdrun.c b/src/kernel/mdrun.c

index 8640ee3e016e9db15d8f19f9928f13708839e974..a5c0e6a6256def6a24b6b0bb702c5eef114b58c5 100644 (file)
--- a/src/kernel/mdrun.c
+++ b/src/kernel/mdrun.c
@@ -431,7 +431,7 @@ int cmain(int argc, char *argv[])
      int           repl_ex_nex   = 0;
      int           nstepout      = 100;
      int           resetstep     = -1;
-    int           nsteps        = -2; /* the value -2 means that the mdp option will be used */
+    gmx_large_int_t nsteps      = -2; /* the value -2 means that the mdp option will be used */
  
      rvec          realddxyz          = {0, 0, 0};
      const char   *ddno_opt[ddnoNR+1] =
@@ -521,7 +521,7 @@ int cmain(int argc, char *argv[])
            "Keep and number checkpoint files" },
          { "-append",  FALSE, etBOOL, {&bAppendFiles},
            "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
-        { "-nsteps",  FALSE, etINT, {&nsteps},
+        { "-nsteps",  FALSE, etGMX_LARGE_INT, {&nsteps},
            "Run this number of steps, overrides .mdp file option" },
          { "-maxh",   FALSE, etREAL, {&max_hours},
            "Terminate after 0.99 times this time (hours)" },
diff --git a/src/kernel/runner.c b/src/kernel/runner.c

index 9c0cae32346428955193f00ea9009067094cf727..765be55653ee82ab5793a832b38605b15d323025 100644 (file)
--- a/src/kernel/runner.c
+++ b/src/kernel/runner.c
@@ -131,7 +131,7 @@ struct mdrunner_arglist
      const char     *ddcsy;
      const char     *ddcsz;
      const char     *nbpu_opt;
-    int             nsteps_cmdline;
+    gmx_large_int_t nsteps_cmdline;
      int             nstepout;
      int             resetstep;
      int             nmultisim;
@@ -193,7 +193,8 @@ static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
                                           const char *dddlb_opt, real dlb_scale,
                                           const char *ddcsx, const char *ddcsy, const char *ddcsz,
                                           const char *nbpu_opt,
-                                         int nsteps_cmdline, int nstepout, int resetstep,
+                                         gmx_large_int_t nsteps_cmdline,
+                                         int nstepout, int resetstep,
                                           int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
                                           real pforce, real cpt_period, real max_hours,
                                           const char *deviceOptions, unsigned long Flags)
@@ -865,10 +866,12 @@ static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
  
  /* Override the value in inputrec with value passed on the command line (if any) */
  static void override_nsteps_cmdline(FILE            *fplog,
-                                    int              nsteps_cmdline,
+                                    gmx_large_int_t  nsteps_cmdline,
                                      t_inputrec      *ir,
                                      const t_commrec *cr)
  {
+    char sbuf[STEPSTRSIZE];
+
      assert(ir);
      assert(cr);
  
@@ -880,13 +883,14 @@ static void override_nsteps_cmdline(FILE            *fplog,
          ir->nsteps = nsteps_cmdline;
          if (EI_DYNAMICS(ir->eI))
          {
-            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
-                    nsteps_cmdline, nsteps_cmdline*ir->delta_t);
+            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps, %.3f ps",
+                    gmx_step_str(nsteps_cmdline, sbuf),
+                    nsteps_cmdline*ir->delta_t);
          }
          else
          {
-            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
-                    nsteps_cmdline);
+            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps",
+                    gmx_step_str(nsteps_cmdline, sbuf));
          }
  
          md_print_warn(cr, fplog, "%s\n", stmp);
@@ -909,7 +913,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
               const char *dddlb_opt, real dlb_scale,
               const char *ddcsx, const char *ddcsy, const char *ddcsz,
               const char *nbpu_opt,
-             int nsteps_cmdline, int nstepout, int resetstep,
+             gmx_large_int_t nsteps_cmdline, int nstepout, int resetstep,
               int nmultisim, int repl_ex_nst, int repl_ex_nex,
               int repl_ex_seed, real pforce, real cpt_period, real max_hours,
               const char *deviceOptions, unsigned long Flags)
diff --git a/src/mdlib/coupling.c b/src/mdlib/coupling.c

index 0bfd9feb566dfd106f575d837eea388943b4c593..cfcfa0b9d7df41758772d2c1c107e3b20285ed77 100644 (file)
--- a/src/mdlib/coupling.c
+++ b/src/mdlib/coupling.c
@@ -1603,7 +1603,7 @@ void vrescale_tcoupl(t_inputrec *ir, gmx_ekindata_t *ekind, real dt,
              Ek = trace(ekind->tcstat[i].ekinh);
          }
  
-        if (opts->tau_t[i] > 0 && opts->nrdf[i] > 0 && Ek > 0)
+        if (opts->tau_t[i] >= 0 && opts->nrdf[i] > 0 && Ek > 0)
          {
              Ek_ref1 = 0.5*opts->ref_t[i]*BOLTZ;
              Ek_ref  = Ek_ref1*opts->nrdf[i];
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index 2611660b2bd3bef01eca5c6d1cedd42e259eaa63..de3bdc9bf7a2c3b55d124cfce81d072b3b940641 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -75,8 +75,13 @@ texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
  /***** The kernels come here *****/
  #include "nbnxn_cuda_kernel_utils.cuh"
  
-/* Generate all combinations of kernels through multiple inclusion:
-   F, F + E, F + prune, F + E + prune. */
+/* Top-level kernel generation: will generate through multiple inclusion the
+ * following flavors for all kernels:
+ * - force-only output;
+ * - force and energy output;
+ * - force-only with pair list pruning;
+ * - force and energy output with pair list pruning.
+ */
  /** Force only **/
  #include "nbnxn_cuda_kernels.cuh"
  /** Force & energy **/
@@ -126,7 +131,7 @@ static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
      /* do we exceed the grid x dimension limit? */
      if (nwork_units > max_grid_x_size)
      {
-        gmx_fatal(FARGS, "Watch out system too large to simulate!\n"
+        gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
                    "The number of nonbonded work units (=number of super-clusters) exceeds the"
                    "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
      }
@@ -141,32 +146,46 @@ static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
  static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
  static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
  
-/* Default kernels */
+/*! Pointers to the default kernels organized in a 3 dim array by:
+ *  electrostatics type, energy calculation on/off, and pruning on/off.
+ *
+ *  Note that the order of electrostatics (1st dimension) has to match the
+ *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
+ */
  static const nbnxn_cu_kfunc_ptr_t
  nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
  {
-    { { k_nbnxn_ewald,              k_nbnxn_ewald_prune },
-      { k_nbnxn_ewald_ener,         k_nbnxn_ewald_ener_prune } },
-    { { k_nbnxn_ewald_twin,         k_nbnxn_ewald_twin_prune },
-      { k_nbnxn_ewald_twin_ener,    k_nbnxn_ewald_twin_ener_prune } },
-    { { k_nbnxn_rf,                 k_nbnxn_rf_prune },
-      { k_nbnxn_rf_ener,            k_nbnxn_rf_ener_prune } },
-    { { k_nbnxn_cutoff,             k_nbnxn_cutoff_prune },
-      { k_nbnxn_cutoff_ener,        k_nbnxn_cutoff_ener_prune } },
+    { { k_nbnxn_cutoff,                     k_nbnxn_cutoff_prune },
+      { k_nbnxn_cutoff_ener,                k_nbnxn_cutoff_ener_prune } },
+    { { k_nbnxn_rf,                         k_nbnxn_rf_prune },
+      { k_nbnxn_rf_ener,                    k_nbnxn_rf_ener_prune } },
+    { { k_nbnxn_ewald_tab,                  k_nbnxn_ewald_tab_prune },
+      { k_nbnxn_ewald_tab_ener,             k_nbnxn_ewald_tab_ener_prune } },
+    { { k_nbnxn_ewald_tab_twin,             k_nbnxn_ewald_tab_twin_prune },
+      { k_nbnxn_ewald_tab_twin_ener,        k_nbnxn_ewald_twin_ener_prune } },
+    { { k_nbnxn_ewald,                      k_nbnxn_ewald_prune },
+      { k_nbnxn_ewald_ener,                 k_nbnxn_ewald_ener_prune } },
+    { { k_nbnxn_ewald_twin,                 k_nbnxn_ewald_twin_prune },
+      { k_nbnxn_ewald_twin_ener,            k_nbnxn_ewald_twin_ener_prune } },
  };
  
-/* Legacy kernels */
+/*! Pointers to the legacy kernels organized in a 3 dim array by:
+ *  electrostatics type, energy calculation on/off, and pruning on/off.
+ *
+ *  Note that the order of electrostatics (1st dimension) has to match the
+ *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
+ */
  static const nbnxn_cu_kfunc_ptr_t
  nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
  {
-    { { k_nbnxn_ewald_legacy,           k_nbnxn_ewald_prune_legacy },
-      { k_nbnxn_ewald_ener_legacy,      k_nbnxn_ewald_ener_prune_legacy } },
-    { { k_nbnxn_ewald_twin_legacy,      k_nbnxn_ewald_twin_prune_legacy },
-      { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
-    { { k_nbnxn_rf_legacy,              k_nbnxn_rf_prune_legacy },
-      { k_nbnxn_rf_ener_legacy,         k_nbnxn_rf_ener_prune_legacy } },
-    { { k_nbnxn_cutoff_legacy,          k_nbnxn_cutoff_prune_legacy },
-      { k_nbnxn_cutoff_ener_legacy,     k_nbnxn_cutoff_ener_prune_legacy } },
+    { { k_nbnxn_cutoff_legacy,              k_nbnxn_cutoff_prune_legacy },
+      { k_nbnxn_cutoff_ener_legacy,         k_nbnxn_cutoff_ener_prune_legacy } },
+    { { k_nbnxn_rf_legacy,                  k_nbnxn_rf_prune_legacy },
+      { k_nbnxn_rf_ener_legacy,             k_nbnxn_rf_ener_prune_legacy } },
+    { { k_nbnxn_ewald_tab_legacy,           k_nbnxn_ewald_tab_prune_legacy },
+      { k_nbnxn_ewald_tab_ener_legacy,      k_nbnxn_ewald_tab_ener_prune_legacy } },
+    { { k_nbnxn_ewald_tab_twin_legacy,      k_nbnxn_ewald_tab_twin_prune_legacy },
+      { k_nbnxn_ewald_tab_twin_ener_legacy, k_nbnxn_ewald_tab_twin_ener_prune_legacy } },
  };
  
  /*! Return a pointer to the kernel version to be executed at the current step. */
@@ -178,6 +197,9 @@ static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
  
      if (NBNXN_KVER_LEGACY(kver))
      {
+        /* no analytical Ewald with legacy kernels */
+        assert(eeltype <= eelCuEWALD_TAB_TWIN);
+
          return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
      }
      else
@@ -656,12 +678,19 @@ void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
      cudaError_t stat;
  
      for (int i = 0; i < eelCuNR; i++)
+    {
          for (int j = 0; j < nEnergyKernelTypes; j++)
+        {
              for (int k = 0; k < nPruneKernelTypes; k++)
              {
-                /* Legacy kernel 16/48 kB Shared/L1 */
-                stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
-                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+                /* Legacy kernel 16/48 kB Shared/L1
+                 * No analytical Ewald!
+                 */
+                if (i != eelCuEWALD_ANA && i != eelCuEWALD_ANA_TWIN)
+                {
+                    stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
+                    CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+                }
  
                  if (devinfo->prop.major >= 3)
                  {
@@ -676,4 +705,6 @@ void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
                  }
                  CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
              }
+        }
+    }
  }
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index 3107e25460782f5cc707867d1361dca285b8c80a..f2d427b18cbaf0628c77049fe46768774c81d71c 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -180,10 +180,65 @@ static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
      ad->nalloc = -1;
  }
  
+/*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
+    earlier GPUs, single or twin cut-off. */
+static int pick_ewald_kernel_type(bool                   bTwinCut,
+                                  const cuda_dev_info_t *dev_info)
+{
+    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
+    int  kernel_type;
+
+    /* Benchmarking/development environment variables to force the use of
+       analytical or tabulated Ewald kernel. */
+    bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
+    bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
+
+    if (bForceAnalyticalEwald && bForceTabulatedEwald)
+    {
+        gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
+                   "requested through environment variables.");
+    }
+
+    /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
+    if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
+    {
+        bUseAnalyticalEwald = true;
+
+        if (debug)
+        {
+            fprintf(debug, "Using analytical Ewald CUDA kernels\n");
+        }
+    }
+    else
+    {
+        bUseAnalyticalEwald = false;
+
+        if (debug)
+        {
+            fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
+        }
+    }
+
+    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
+       forces it (use it for debugging/benchmarking only). */
+    if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
+    {
+        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
+    }
+    else
+    {
+        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
+    }
+
+    return kernel_type;
+}
+
+
  /*! Initializes the nonbonded parameter data structure. */
  static void init_nbparam(cu_nbparam_t *nbp,
                           const interaction_const_t *ic,
-                         const nonbonded_verlet_t *nbv)
+                         const nonbonded_verlet_t *nbv,
+                         const cuda_dev_info_t *dev_info)
  {
      cudaError_t stat;
      int         ntypes, nnbfp;
@@ -210,16 +265,8 @@ static void init_nbparam(cu_nbparam_t *nbp,
      }
      else if ((EEL_PME(ic->eeltype) || ic->eeltype==eelEWALD))
      {
-        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off, unless
-           forced by the env. var. (used only for benchmarking). */
-        if (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL)
-        {
-            nbp->eeltype = eelCuEWALD;
-        }
-        else
-        {
-            nbp->eeltype = eelCuEWALD_TWIN;
-        }
+        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
+        nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
      }
      else
      {
@@ -228,9 +275,9 @@ static void init_nbparam(cu_nbparam_t *nbp,
      }
  
      /* generate table for PME */
-    if (nbp->eeltype == eelCuEWALD)
+    nbp->coulomb_tab = NULL;
+    if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
      {
-        nbp->coulomb_tab = NULL;
          init_ewald_coulomb_force_table(nbp);
      }
  
@@ -256,17 +303,8 @@ void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
      nbp->rcoulomb_sq    = ic->rcoulomb * ic->rcoulomb;
      nbp->ewald_beta     = ic->ewaldcoeff;
  
-    /* When switching to/from twin cut-off, the electrostatics type needs updating.
-       (The env. var. that forces twin cut-off is for benchmarking only!) */
-    if (ic->rcoulomb == ic->rvdw &&
-        getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL)
-    {
-        nbp->eeltype = eelCuEWALD;
-    }
-    else
-    {
-        nbp->eeltype = eelCuEWALD_TWIN;
-    }
+    nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
+                                                 cu_nb->dev_info);
  
      init_ewald_coulomb_force_table(cu_nb->nbparam);
  }
@@ -609,7 +647,7 @@ void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t cu_nb,
                             const nonbonded_verlet_t *nbv)
  {
      init_atomdata_first(cu_nb->atdat, nbv->grp[0].nbat->ntype);
-    init_nbparam(cu_nb->nbparam, ic, nbv);
+    init_nbparam(cu_nb->nbparam, ic, nbv, cu_nb->dev_info);
  
      /* clear energy and shift force outputs */
      nbnxn_cuda_clear_e_fshift(cu_nb);
@@ -804,7 +842,7 @@ void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb)
      plist_nl    = cu_nb->plist[eintNonlocal];
      timers      = cu_nb->timers;
  
-    if (nbparam->eeltype == eelCuEWALD || nbparam->eeltype == eelCuEWALD_TWIN)
+    if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
      {
        stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
        CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 23df93a292c0ffafc6a7a5194c2041d46ba7439e..73acdfecd594e222c40e00b25e0127de1419c839 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -49,6 +49,11 @@
  #define IATYPE_SHMEM
  #endif
  
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
  /*
     Kernel launch parameters:
      - #blocks   = #pair lists, blockId = pair list Id
@@ -95,16 +100,20 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #ifdef EL_RF
      float two_k_rf              = nbparam.two_k_rf;
  #endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
      float coulomb_tab_scale     = nbparam.coulomb_tab_scale;
  #endif
+#ifdef EL_EWALD_ANA
+    float beta2                 = nbparam.ewald_beta*nbparam.ewald_beta;
+    float beta3                 = nbparam.ewald_beta*nbparam.ewald_beta*nbparam.ewald_beta;
+#endif
  #ifdef PRUNE_NBL
      float rlist_sq              = nbparam.rlist_sq;
  #endif
  
  #ifdef CALC_ENERGIES
      float lj_shift    = nbparam.sh_invrc6;
-#ifdef EL_EWALD
+#ifdef EL_EWALD_ANY
      float beta        = nbparam.ewald_beta;
      float ewald_shift = nbparam.sh_ewald;
  #else
@@ -185,7 +194,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
      E_lj = 0.0f;
      E_el = 0.0f;
  
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_ANY || defined EL_RF
      if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
      {
          /* we have the diagonal: add the charge self interaction energy term */
@@ -256,7 +265,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
                      fcj_buf = make_float3(0.0f);
  
                      /* The PME and RF kernels don't unroll with CUDA <v4.1. */
-#if !defined PRUNE_NBL && !(CUDA_VERSION < 4010 && (defined EL_EWALD || defined EL_RF))
+#if !defined PRUNE_NBL && !(CUDA_VERSION < 4010 && (defined EL_EWALD_ANY || defined EL_RF))
  #pragma unroll 8
  #endif
                      for(i = 0; i < NCL_PER_SUPERCL; i++)
@@ -289,7 +298,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
                              int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
  
                              /* cutoff & exclusion check */
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_ANY || defined EL_RF
                              if (r2 < rcoulomb_sq *
                                  (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
  #else
@@ -314,7 +323,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
                                  inv_r   = rsqrt(r2);
                                  inv_r2  = inv_r * inv_r;
                                  inv_r6  = inv_r2 * inv_r2 * inv_r2;
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_ANY || defined EL_RF
                                  /* We could mask inv_r2, but with Ewald
                                   * masking both inv_r6 and F_invr is faster */
                                  inv_r6  *= int_bit;
@@ -345,9 +354,11 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #ifdef EL_RF
                                  F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
  #endif
-#ifdef EL_EWALD
+#if defined EL_EWALD_ANA
+                                F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
                                  F_invr  += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r;
-#endif
+#endif /* EL_EWALD_ANA/TAB */
  
  #ifdef CALC_ENERGIES
  #ifdef EL_CUTOFF
@@ -356,10 +367,10 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #ifdef EL_RF
                                  E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
  #endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_ANY
                                  /* 1.0f - erff is faster than erfcf */
                                  E_el    += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
-#endif
+#endif /* EL_EWALD_ANY */
  #endif
                                  f_ij    = rv * F_invr;
  
@@ -440,3 +451,5 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
  #endif
  #endif
  }
+
+#undef EL_EWALD_ANY
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh

index 0733d9412e2577869b9b1afd1304387bc09c334f..cff062d7a1381d3f909c2489e8de614ee527676c 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
@@ -42,6 +42,12 @@
   * code that is in double precision.
   */
  
+/* Analytical Ewald is not implemented for the legacy kernels (as it is anyway
+   slower than the tabulated kernel on Fermi). */
+#ifdef EL_EWALD_ANA
+#error Trying to generate Analytical Ewald legacy kernels which is not implemented in the legacy kernels!
+#endif
+
  /*
     Kernel launch parameters:
      - #blocks   = #pair lists, blockId = pair list Id
@@ -88,7 +94,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
  #ifdef EL_RF
      float two_k_rf              = nbparam.two_k_rf;
  #endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
      float coulomb_tab_scale     = nbparam.coulomb_tab_scale;
  #endif
  #ifdef PRUNE_NBL
@@ -97,7 +103,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
  
  #ifdef CALC_ENERGIES
      float lj_shift    = nbparam.sh_invrc6;
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
      float beta        = nbparam.ewald_beta;
      float ewald_shift = nbparam.sh_ewald;
  #else
@@ -122,14 +128,12 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
      float qi, qj_f,
            r2, inv_r, inv_r2, inv_r6,
            c6, c12,
+          int_bit,
  #ifdef CALC_ENERGIES
            E_lj, E_el, E_lj_p,
  #endif
            F_invr;
-    unsigned int wexcl, int_bit, imask, imask_j;
-#ifdef PRUNE_NBL
-    unsigned int imask_prune;
-#endif
+    unsigned int wexcl, imask, mask_ji;
      float4 xqbuf;
      float3 xi, xj, rv, f_ij, fcj_buf, fshift_buf;
      float3 fci_buf[NCL_PER_SUPERCL];    /* i force buffer */
@@ -162,7 +166,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
      E_lj = 0.0f;
      E_el = 0.0f;
  
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_TAB || defined EL_RF
      if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
      {
          /* we have the diagonal: add the charge self interaction energy term */
@@ -201,20 +205,16 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
          if (imask)
  #endif
          {
-#ifdef PRUNE_NBL
-            imask_prune = imask;
-#endif
-
              /* nvcc >v4.1 doesn't like this loop, it refuses to unroll it */
  #if CUDA_VERSION >= 4010
              #pragma unroll 4
  #endif
              for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
              {
-                imask_j = (imask >> (jm * CL_SIZE)) & supercl_interaction_mask;
-                if (imask_j)
+                mask_ji = (imask >> (jm * CL_SIZE)) & supercl_interaction_mask;
+                if (mask_ji)
                  {
-                    nsubi = __popc(imask_j);
+                    nsubi = __popc(mask_ji);
  
                      cj      = pl_cj4[j4].cj[jm];
                      aj      = cj * CL_SIZE + tidxj;
@@ -233,8 +233,8 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
                         which is a pity because here unrolling could help.  */
                      for (cii = 0; cii < nsubi; cii++)
                      {
-                        i = __ffs(imask_j) - 1;
-                        imask_j &= ~(1U << i);
+                        i = __ffs(mask_ji) - 1;
+                        mask_ji &= ~(1U << i);
  
                          ci_offset   = i;    /* i force buffer offset */
  
@@ -255,14 +255,14 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
                                 cluster-pair in imask gets set to 0. */
                          if (!__any(r2 < rlist_sq))
                          {
-                            imask_prune &= ~(1U << (jm * NCL_PER_SUPERCL + i));
+                            imask &= ~(1U << (jm * NCL_PER_SUPERCL + i));
                          }
  #endif
  
-                        int_bit = ((wexcl >> (jm * NCL_PER_SUPERCL + i)) & 1);
+                        int_bit = ((wexcl >> (jm * NCL_PER_SUPERCL + i)) & 1) ? 1.0f : 0.0f;
  
                          /* cutoff & exclusion check */
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_TAB || defined EL_RF
                          if (r2 < rcoulomb_sq *
                              (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
  #else
@@ -283,7 +283,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
                              inv_r   = rsqrt(r2);
                              inv_r2  = inv_r * inv_r;
                              inv_r6  = inv_r2 * inv_r2 * inv_r2;
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_TAB || defined EL_RF
                              /* We could mask inv_r2, but with Ewald
                               * masking both inv_r6 and F_invr is faster */
                              inv_r6  *= int_bit;
@@ -292,19 +292,19 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
                              F_invr  = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
  
  #ifdef CALC_ENERGIES
-                                E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f);
+                            E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f);
  #endif
  
  #ifdef VDW_CUTOFF_CHECK
-                                /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
-                                vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
-                                F_invr  *= vdw_in_range;
+                            /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
+                            vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                            F_invr  *= vdw_in_range;
  #ifdef CALC_ENERGIES
-                                E_lj_p  *= vdw_in_range;
+                            E_lj_p  *= vdw_in_range;
  #endif
  #endif
  #ifdef CALC_ENERGIES
-                                E_lj    += E_lj_p;
+                            E_lj    += E_lj_p;
  #endif
  
  
@@ -314,9 +314,9 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
  #ifdef EL_RF
                              F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
  #endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
                              F_invr  += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r;
-#endif
+#endif /* EL_EWALD_TAB */
  
  #ifdef CALC_ENERGIES
  #ifdef EL_CUTOFF
@@ -325,7 +325,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
  #ifdef EL_RF
                              E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
  #endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
                              /* 1.0f - erff is faster than erfcf */
                              E_el    += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
  #endif
@@ -352,7 +352,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
  #ifdef PRUNE_NBL
              /* Update the imask with the new one which does not contain the
                 out of range clusters anymore. */
-            pl_cj4[j4].imei[widx].imask = imask_prune;
+            pl_cj4[j4].imei[widx].imask = imask;
  #endif
          }
      }
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

index 4992f2beeb38748730e6b43e6dbea2934e9e4d75..ad08ffa5e66b8d592506f8414f13dc3f5e858051 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -69,6 +69,46 @@ float interpolate_coulomb_force_r(float r, float scale)
              + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
  }
  
+/*! Calculate analytical Ewald correction term. */
+static inline __device__
+float pmecorrF(float z2)
+{
+    const float FN6 = -1.7357322914161492954e-8f;
+    const float FN5 = 1.4703624142580877519e-6f;
+    const float FN4 = -0.000053401640219807709149f;
+    const float FN3 = 0.0010054721316683106153f;
+    const float FN2 = -0.019278317264888380590f;
+    const float FN1 = 0.069670166153766424023f;
+    const float FN0 = -0.75225204789749321333f;
+
+    const float FD4 = 0.0011193462567257629232f;
+    const float FD3 = 0.014866955030185295499f;
+    const float FD2 = 0.11583842382862377919f;
+    const float FD1 = 0.50736591960530292870f;
+    const float FD0 = 1.0f;
+
+    float       z4;
+    float       polyFN0,polyFN1,polyFD0,polyFD1;
+
+    z4          = z2*z2;
+
+    polyFD0     = FD4*z4 + FD2;
+    polyFD1     = FD3*z4 + FD1;
+    polyFD0     = polyFD0*z4 + FD0;
+    polyFD0     = polyFD1*z2 + polyFD0;
+
+    polyFD0     = 1.0f/polyFD0;
+
+    polyFN0     = FN6*z4 + FN4;
+    polyFN1     = FN5*z4 + FN3;
+    polyFN0     = polyFN0*z4 + FN2;
+    polyFN1     = polyFN1*z4 + FN1;
+    polyFN0     = polyFN0*z4 + FN0;
+    polyFN0     = polyFN1*z2 + polyFN0;
+
+    return polyFN0*polyFD0;
+}
+
  /*! Final j-force reduction; this generic implementation works with
   *  arbitrary array sizes.
   */
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh

index 64c1008eb76e8450a394b5cf87905abd69fbc8d2..78aaa0dd462da74ee493616b65096f2781736189 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@ -37,15 +37,17 @@
   */
  
  /*! \file
- *  This header has the sole purpose of generating kernels for the different
- *  type of electrostatics supported: Cut-off, Reaction-Field, and Ewald/PME;
- *  the latter has a twin-range cut-off version (rcoul!=rvdw) which enables
- *  PME tuning (otherwise in the Verlet scheme rcoul==rvdw).
+ *  This header has the sole purpose of generating kernels for the supported
+ *  electrostatics types: cut-off, reaction-field, Ewald, and tabulated Ewald.
   *
- *  (No include fence as it is meant to be included multiple times.)
+ *  The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ *  require an extra distance check to enable  PP-PME load balancing
+ *  (otherwise, by default rcoul == rvdw).
+ *
+ *  NOTE: No include fence as it is meant to be included multiple times.
   */
  
-/* Cut-Off */
+/* Analytical plain cut-off kernels */
  #define EL_CUTOFF
  #define NB_KERNEL_FUNC_NAME(x,...) x##_cutoff##__VA_ARGS__
  #include "nbnxn_cuda_kernel_legacy.cuh"
@@ -53,7 +55,7 @@
  #undef EL_CUTOFF
  #undef NB_KERNEL_FUNC_NAME
  
-/* Reaction-Field */
+/* Analytical reaction-field kernels */
  #define EL_RF
  #define NB_KERNEL_FUNC_NAME(x,...) x##_rf##__VA_ARGS__
  #include "nbnxn_cuda_kernel_legacy.cuh"
@@ -61,20 +63,40 @@
  #undef EL_RF
  #undef NB_KERNEL_FUNC_NAME
  
-/* Ewald */
-#define EL_EWALD
+/* Analytical Ewald interaction kernels
+ * NOTE: no legacy kernels with analytical Ewald.
+ */
+#define EL_EWALD_ANA
  #define NB_KERNEL_FUNC_NAME(x,...) x##_ewald##__VA_ARGS__
-#include "nbnxn_cuda_kernel_legacy.cuh"
  #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD
+#undef EL_EWALD_ANA
  #undef NB_KERNEL_FUNC_NAME
  
-/* Ewald with twin-range cut-off */
-#define EL_EWALD
+/* Analytical Ewald interaction kernels with twin-range cut-off
+ * NOTE: no legacy kernels with analytical Ewald.
+ */
+#define EL_EWALD_ANA
  #define VDW_CUTOFF_CHECK
  #define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_twin##__VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef EL_EWALD_ANA
+#undef VDW_CUTOFF_CHECK
+#undef NB_KERNEL_FUNC_NAME
+
+/* Tabulated Ewald interaction kernels */
+#define EL_EWALD_TAB
+#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab##__VA_ARGS__
+#include "nbnxn_cuda_kernel_legacy.cuh"
+#include "nbnxn_cuda_kernel.cuh"
+#undef EL_EWALD_TAB
+#undef NB_KERNEL_FUNC_NAME
+
+/* Tabulated Ewald interaction kernels with twin-range cut-off */
+#define EL_EWALD_TAB
+#define VDW_CUTOFF_CHECK
+#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab_twin##__VA_ARGS__
  #include "nbnxn_cuda_kernel_legacy.cuh"
  #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD
+#undef EL_EWALD_TAB
  #undef VDW_CUTOFF_CHECK
  #undef NB_KERNEL_FUNC_NAME
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h

index 55f67e928838fd526ceacfceadba2900680c6d95..901e784c328860761b006e8cb2ae2f38eefb9316 100644 (file)
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -47,13 +47,25 @@
  extern "C" {
  #endif
  
-/*! Types of electrostatics available in the CUDA nonbonded force kernels. */
+/*! Types of electrostatics implementations available in the CUDA non-bonded
+ *  force kernels. These represent both the electrostatics types implemented
+ *  by the kernels (cut-off, RF, and Ewald - a subset of what's defined in
+ *  enums.h) as well as encode implementation details analytical/tabulated
+ *  and single or twin cut-off (for Ewald kernels).
+ *  Note that the cut-off and RF kernels have only analytical flavor and unlike
+ *  in the CPU kernels, the tabulated kernels are ATM Ewald-only.
+ *
+ *  The order of pointers to different electrostatic kernels defined in
+ *  nbnxn_cuda.cu by the nb_default_kfunc_ptr and nb_legacy_kfunc_ptr arrays
+ *  should match the order of enumerated types below. */
  enum {
-    eelCuEWALD, eelCuEWALD_TWIN, eelCuRF, eelCuCUT, eelCuNR
+    eelCuCUT, eelCuRF, eelCuEWALD_TAB, eelCuEWALD_TAB_TWIN, eelCuEWALD_ANA, eelCuEWALD_ANA_TWIN, eelCuNR
  };
  
+/*! Kernel flavors with different set of optimizations: default for CUDA <=v4.1
+ *  compilers and legacy for earlier, 3.2 and 4.0 CUDA compilers. */
  enum {
-    eNbnxnCuKDefault, eNbnxnCuKLegacy, eNbnxnCuKOld, eNbnxnCuKNR
+    eNbnxnCuKDefault, eNbnxnCuKLegacy, eNbnxnCuKNR
  };
  
  #define NBNXN_KVER_OLD(k)      (k == eNbnxnCuKOld)
diff --git a/src/mdlib/sim_util.c b/src/mdlib/sim_util.c

index 4156a0b1b1c897042a56c4fb011d0ed76c4968f7..72251d78bbe2b7a4ac85f39245c8bb637c3bd3ee 100644 (file)
--- a/src/mdlib/sim_util.c
+++ b/src/mdlib/sim_util.c
@@ -1162,42 +1162,18 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
  
          GMX_BARRIER(cr->mpi_comm_mygroup);
      }
+
      if (inputrec->ePull == epullCONSTRAINT)
      {
          clear_pull_forces(inputrec->pull);
      }
  
-    /* update QMMMrec, if necessary */
-    if (fr->bQMMM)
-    {
-        update_QMMMrec(cr, fr, x, mdatoms, box, top);
-    }
-
-    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
-    {
-        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
-                       f, enerd, lambda, fr);
-    }
-
-    /* Compute the bonded and non-bonded energies and optionally forces */
-    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
-                      cr, nrnb, wcycle, mdatoms, &(inputrec->opts),
-                      x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, mtop, top, fr->born,
-                      &(top->atomtypes), bBornRadii, box,
-                      inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
-                      flags, &cycles_pme);
-
-    if (bSepLRF)
-    {
-        if (do_per_step(step, inputrec->nstcalclr))
-        {
-            /* Add the long range forces to the short range forces */
-            for (i = 0; i < fr->natoms_force_constr; i++)
-            {
-                rvec_add(fr->f_twin[i], f[i], f[i]);
-            }
-        }
-    }
+    /* We calculate the non-bonded forces, when done on the CPU, here.
+     * We do this before calling do_force_lowlevel, as in there bondeds
+     * forces are calculated before PME, which does communication.
+     * With this order, non-bonded and bonded force calculation imbalance
+     * can be balanced out by the domain decomposition load balancing.
+     */
  
      if (!bUseOrEmulGPU)
      {
@@ -1206,7 +1182,6 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                       nrnb, wcycle);
      }
  
-
      if (!bUseOrEmulGPU || bDiffKernels)
      {
          int aloc;
@@ -1248,6 +1223,38 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          }
      }
  
+    /* update QMMMrec, if necessary */
+    if (fr->bQMMM)
+    {
+        update_QMMMrec(cr, fr, x, mdatoms, box, top);
+    }
+
+    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+    {
+        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+                       f, enerd, lambda, fr);
+    }
+
+    /* Compute the bonded and non-bonded energies and optionally forces */
+    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+                      cr, nrnb, wcycle, mdatoms, &(inputrec->opts),
+                      x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, mtop, top, fr->born,
+                      &(top->atomtypes), bBornRadii, box,
+                      inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
+                      flags, &cycles_pme);
+
+    if (bSepLRF)
+    {
+        if (do_per_step(step, inputrec->nstcalclr))
+        {
+            /* Add the long range forces to the short range forces */
+            for (i = 0; i < fr->natoms_force_constr; i++)
+            {
+                rvec_add(fr->f_twin[i], f[i], f[i]);
+            }
+        }
+    }
+
      cycles_force += wallcycle_stop(wcycle, ewcFORCE);
      GMX_BARRIER(cr->mpi_comm_mygroup);
  
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt

index b2ff2035a63b816d637815499013a6819c8d2076..bfff897320c17834d86130455197d17f1f21f5f2 100644 (file)
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -42,17 +42,29 @@ if(REGRESSIONTEST_DOWNLOAD AND CMAKE_VERSION VERSION_LESS "2.8.2")
          "REGRESSIONTEST_DOWNLOAD not supported with cmake ${CMAKE_VERSION}" FORCE)
  endif()
  if(REGRESSIONTEST_DOWNLOAD)
-    if(NOT REGRESSIONTEST_VERSION)
-        message(FATAL_ERROR "The configuration files do not specify what regressiontests tarball is suitable for automated download and testing. Please obtain and use a suitable set of tests yourself.")
+    if("${PROJECT_VERSION}" MATCHES "-dev")
+       if("${PROJECT_VERSION}" MATCHES "^5[.]")
+          set(REGRESSIONTEST_URL http://gerrit.gromacs.org/snapshot/refs/heads/master)
+       else()
+          set(REGRESSIONTEST_URL http://gerrit.gromacs.org/snapshot/refs/heads/release-4-6)
+      endif()
+      set(REGRESSIONTEST_PATH "${CMAKE_CURRENT_BINARY_DIR}/regressiontests"
+           CACHE PATH "Path to auto-downloaded regressiontests" FORCE)
+    else()
+        if(NOT REGRESSIONTEST_VERSION)
+          message(FATAL_ERROR "The configuration files do not specify what regressiontests tarball is suitable for automated download and testing. Please obtain and use a suitable set of tests yourself.")
+        endif()
+        set(REGRESSIONTEST_URL http://gerrit.gromacs.org/download/regressiontests-${REGRESSIONTEST_VERSION}.tar.gz)
+        set(REGRESSIONTEST_PATH
+           "${CMAKE_CURRENT_BINARY_DIR}/regressiontests-${REGRESSIONTEST_VERSION}"
+           CACHE PATH "Path to auto-downloaded regressiontests" FORCE)
      endif()
-    set(REGRESSIONTEST_URL
-        http://gerrit.gromacs.org/download/regressiontests-${REGRESSIONTEST_VERSION}.tar.gz)
      set(REGRESSIONTEST_FILE "${CMAKE_CURRENT_BINARY_DIR}/regressiontests.tgz")
      message("Downloading: ${REGRESSIONTEST_URL}")
      file(DOWNLOAD ${REGRESSIONTEST_URL} "${REGRESSIONTEST_FILE}" SHOW_PROGRESS STATUS status LOG log)
      list(GET status 0 status_code)
      list(GET status 1 status_string)
-    
+
      if(NOT status_code EQUAL 0)
          message(FATAL_ERROR "error: downloading '${REGRESSIONTEST_URL}' failed
  status_code: ${status_code}
@@ -60,9 +72,6 @@ status_string: ${status_string}
  log: ${log}")
      endif()
  
-    set(REGRESSIONTEST_PATH
-        "${CMAKE_CURRENT_BINARY_DIR}/regressiontests-${REGRESSIONTEST_VERSION}"
-        CACHE PATH "Path to auto-downloaded regressiontests" FORCE)
      file(REMOVE_RECURSE "${REGRESSIONTEST_PATH}") #delete potential prior folder
      execute_process(COMMAND ${CMAKE_COMMAND} -E tar xf "${REGRESSIONTEST_FILE}"
          WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
author	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 29 Apr 2013 11:20:53 +0000 (13:20 +0200)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Mon, 29 Apr 2013 11:20:53 +0000 (13:20 +0200)
include/mdrun.h		patch \| blob \| history
src/gmxlib/selection/compiler.c		patch \| blob \| history
src/gmxlib/selection/evaluate.c		patch \| blob \| history
src/kernel/mdrun.c		patch \| blob \| history
src/kernel/runner.c		patch \| blob \| history
src/mdlib/coupling.c		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda.cu		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh		patch \| blob \| history
src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h		patch \| blob \| history
src/mdlib/sim_util.c		patch \| blob \| history
tests/CMakeLists.txt		patch \| blob \| history