GPU Ewald table kernel now uses CPU spacing

author Berk Hess <hess@kth.se>

Mon, 15 Jun 2015 21:56:26 +0000 (23:56 +0200)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Wed, 24 Jun 2015 20:55:27 +0000 (22:55 +0200)
author Berk Hess <hess@kth.se>
Mon, 15 Jun 2015 21:56:26 +0000 (23:56 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Wed, 24 Jun 2015 20:55:27 +0000 (22:55 +0200)
diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp

index 2d5ff5e6452aed2cbcfe96c54c3adc0868158d92..fab16ec2b7c201f2a16159994ddb29e9ca84458f 100644 (file)
--- a/src/gromacs/ewald/pme-load-balancing.cpp
+++ b/src/gromacs/ewald/pme-load-balancing.cpp
@@ -528,7 +528,6 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
      double       cycles_fast;
      char         buf[STRLEN], sbuf[22];
      real         rtab;
-    gmx_bool     bUsesSimpleTables = TRUE;
  
      if (pme_lb->stage == pme_lb->nstage)
      {
@@ -772,7 +771,9 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
          }
      }
  
-    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
+    /* We always re-initialize the tables whether they are used or not */
+    init_interaction_const_tables(NULL, ic, rtab);
+
      nbnxn_gpu_pme_loadbal_update_param(nbv, ic);
  
      /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
@@ -794,12 +795,6 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
      }
  #endif  /* GMX_THREAD_MPI */
  
-    /* Usually we won't need the simple tables with GPUs.
-     * But we do with hybrid acceleration and with free energy.
-     * To avoid bugs, we always re-initialize the simple tables here.
-     */
-    init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab);
-
      if (!pme_lb->bSepPMERanks)
      {
          if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
diff --git a/src/gromacs/legacyheaders/force.h b/src/gromacs/legacyheaders/force.h

index 4014b5740e689a0a21b2729f0f2905c1e9d96d77..fa5c22cff41ea6a0a92d868b5983e5db28a368cf 100644 (file)
--- a/src/gromacs/legacyheaders/force.h
+++ b/src/gromacs/legacyheaders/force.h
@@ -155,12 +155,8 @@ gmx_bool uses_simple_tables(int                        cutoff_scheme,
  
  void init_interaction_const_tables(FILE                *fp,
                                     interaction_const_t *ic,
-                                   gmx_bool             bSimpleTable,
                                     real                 rtab);
-/* Initializes the tables in the interaction constant data structure.
- * Setting verlet_kernel_type to -1 always initializes tables for
- * use with group kernels.
- */
+/* Initializes the tables in the interaction constant data structure. */
  
  void init_forcerec(FILE              *fplog,
                     const output_env_t oenv,
diff --git a/src/gromacs/legacyheaders/types/interaction_const.h b/src/gromacs/legacyheaders/types/interaction_const.h

index 319d6c2a61995b79fcac08c7636bdf1c863d9658..e1016d26d14c7d3bab2d441f6ddee3be4f62790c 100644 (file)
--- a/src/gromacs/legacyheaders/types/interaction_const.h
+++ b/src/gromacs/legacyheaders/types/interaction_const.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -68,6 +68,8 @@ typedef struct {
  } switch_consts_t;
  
  typedef struct {
+    int             cutoff_scheme;
+
      /* VdW */
      int             vdwtype;
      int             vdw_modifier;
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index e5becd42053c11141216b368d0dd479f524cf4ff..7cde5b73b295ba129c457201269aab2f70542406 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -43,6 +43,8 @@
  #include <stdlib.h>
  #include <string.h>
  
+#include <algorithm>
+
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/ewald/ewald.h"
  #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
@@ -1833,27 +1835,24 @@ gmx_bool uses_simple_tables(int                 cutoff_scheme,
  }
  
  static void init_ewald_f_table(interaction_const_t *ic,
-                               gmx_bool             bUsesSimpleTables,
                                 real                 rtab)
  {
      real maxr;
  
-    if (bUsesSimpleTables)
-    {
-        /* Get the Ewald table spacing based on Coulomb and/or LJ
-         * Ewald coefficients and rtol.
-         */
-        ic->tabq_scale = ewald_spline3_table_scale(ic);
+    /* Get the Ewald table spacing based on Coulomb and/or LJ
+     * Ewald coefficients and rtol.
+     */
+    ic->tabq_scale = ewald_spline3_table_scale(ic);
  
-        maxr           = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb;
-        ic->tabq_size  = (int)(maxr*ic->tabq_scale) + 2;
+    if (ic->cutoff_scheme == ecutsVERLET)
+    {
+        maxr = ic->rcoulomb;
      }
      else
      {
-        ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
-        /* Subtract 2 iso 1 to avoid access out of range due to rounding */
-        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
+        maxr = std::max(ic->rcoulomb, rtab);
      }
+    ic->tabq_size  = static_cast<int>(maxr*ic->tabq_scale) + 2;
  
      sfree_aligned(ic->tabq_coul_FDV0);
      sfree_aligned(ic->tabq_coul_F);
@@ -1885,12 +1884,11 @@ static void init_ewald_f_table(interaction_const_t *ic,
  
  void init_interaction_const_tables(FILE                *fp,
                                     interaction_const_t *ic,
-                                   gmx_bool             bUsesSimpleTables,
                                     real                 rtab)
  {
      if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype) || EVDW_PME(ic->vdwtype))
      {
-        init_ewald_f_table(ic, bUsesSimpleTables, rtab);
+        init_ewald_f_table(ic, rtab);
  
          if (fp != NULL)
          {
@@ -1957,6 +1955,8 @@ init_interaction_const(FILE                       *fp,
  
      snew(ic, 1);
  
+    ic->cutoff_scheme   = fr->cutoff_scheme;
+
      /* Just allocate something so we can free it */
      snew_aligned(ic->tabq_coul_FDV0, 16, 32);
      snew_aligned(ic->tabq_coul_F, 16, 32);
@@ -3227,10 +3227,9 @@ void init_forcerec(FILE              *fp,
          init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
      }
  
+    init_interaction_const_tables(fp, fr->ic, rtab);
+
      initialize_gpu_constants(cr, fr->ic, fr->nbv);
-    init_interaction_const_tables(fp, fr->ic,
-                                  uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1),
-                                  rtab);
  
      if (ir->eDispCorr != edispcNO)
      {
diff --git a/src/gromacs/mdlib/nbnxn_consts.h b/src/gromacs/mdlib/nbnxn_consts.h

index d685a453c06396e214c37299813a32012d2e7ca2..3c5da829321578e51da66c7e089a7640fb313a5a 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_consts.h
+++ b/src/gromacs/mdlib/nbnxn_consts.h
@@ -78,10 +78,6 @@ extern "C" {
  #define NBNXN_AVOID_SING_R2_INC  1.0e-36
  #endif
  
-/* Coulomb force table size chosen such that it fits along the non-bonded
-   parameters in the texture cache. */
-#define GPU_EWALD_COULOMB_FORCE_TABLE_SIZE 1536
-
  
  /* Strides for x/f with xyz and xyzq coordinate (and charge) storage */
  #define STRIDE_XYZ   3
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index 1cd9b4aa1368fa98d69f19825594c39b771df7cb..3c2b38bcb272712ad3935cc88566ff68cf407075 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -52,7 +52,6 @@
  #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
  #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
  #include "gromacs/legacyheaders/gmx_detect_hardware.h"
-#include "gromacs/legacyheaders/tables.h"
  #include "gromacs/legacyheaders/typedefs.h"
  #include "gromacs/legacyheaders/types/enums.h"
  #include "gromacs/legacyheaders/types/force_flags.h"
@@ -114,74 +113,65 @@ static void md_print_warn(FILE       *fplog,
  /* Fw. decl. */
  static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
  
+/* Fw. decl, */
+static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
+                                          const gmx_device_info_t *dev_info);
+
  
  /*! Tabulates the Ewald Coulomb force and initializes the size/scale
      and the table GPU array. If called with an already allocated table,
      it just re-uploads the table.
   */
-static void init_ewald_coulomb_force_table(cu_nbparam_t            *nbp,
-                                           const gmx_device_info_t *dev_info)
+static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
+                                           cu_nbparam_t              *nbp,
+                                           const gmx_device_info_t   *dev_info)
  {
-    float       *ftmp, *coul_tab;
-    int          tabsize;
-    double       tabscale;
+    float       *coul_tab;
      cudaError_t  stat;
  
-    tabsize     = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
-    /* Subtract 2 iso 1 to avoid access out of range due to rounding */
-    tabscale    = (tabsize - 2) / sqrt(nbp->rcoulomb_sq);
-
-    pmalloc((void**)&ftmp, tabsize*sizeof(*ftmp));
-
-    table_spline3_fill_ewald_lr(ftmp, NULL, NULL, tabsize,
-                                1/tabscale, nbp->ewald_beta, v_q_ewald_lr);
-
-    /* If the table pointer == NULL the table is generated the first time =>
-       the array pointer will be saved to nbparam and the texture is bound.
-     */
-    coul_tab = nbp->coulomb_tab;
-    if (coul_tab == NULL)
+    if (nbp->coulomb_tab != NULL)
      {
-        stat = cudaMalloc((void **)&coul_tab, tabsize*sizeof(*coul_tab));
-        CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
+        nbnxn_cuda_free_nbparam_table(nbp, dev_info);
+    }
+
+    stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab));
+    CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
  
-        nbp->coulomb_tab = coul_tab;
+    nbp->coulomb_tab = coul_tab;
  
  #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
-        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (dev_info->prop.major >= 3)
-        {
-            cudaResourceDesc rd;
-            memset(&rd, 0, sizeof(rd));
-            rd.resType                  = cudaResourceTypeLinear;
-            rd.res.linear.devPtr        = nbp->coulomb_tab;
-            rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
-            rd.res.linear.desc.x        = 32;
-            rd.res.linear.sizeInBytes   = tabsize*sizeof(*coul_tab);
+    /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+    if (dev_info->prop.major >= 3)
+    {
+        cudaResourceDesc rd;
+        memset(&rd, 0, sizeof(rd));
+        rd.resType                  = cudaResourceTypeLinear;
+        rd.res.linear.devPtr        = nbp->coulomb_tab;
+        rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
+        rd.res.linear.desc.x        = 32;
+        rd.res.linear.sizeInBytes   = ic->tabq_size*sizeof(*coul_tab);
  
-            cudaTextureDesc td;
-            memset(&td, 0, sizeof(td));
-            td.readMode                 = cudaReadModeElementType;
-            stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
-            CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
-        }
-        else
+        cudaTextureDesc td;
+        memset(&td, 0, sizeof(td));
+        td.readMode                 = cudaReadModeElementType;
+        stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
+        CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
+    }
+    else
  #endif  /* HAVE_CUDA_TEXOBJ_SUPPORT */
-        {
-            GMX_UNUSED_VALUE(dev_info);
-            cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
-            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
-                                   coul_tab, &cd, tabsize*sizeof(*coul_tab));
-            CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
-        }
+    {
+        GMX_UNUSED_VALUE(dev_info);
+        cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
+        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
+                               coul_tab, &cd,
+                               ic->tabq_size*sizeof(*coul_tab));
+        CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
      }
  
-    cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab));
-
-    nbp->coulomb_tab_size     = tabsize;
-    nbp->coulomb_tab_scale    = tabscale;
+    cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab));
  
-    pfree(ftmp);
+    nbp->coulomb_tab_size     = ic->tabq_size;
+    nbp->coulomb_tab_scale    = ic->tabq_scale;
  }
  
  
@@ -362,7 +352,7 @@ static void init_nbparam(cu_nbparam_t              *nbp,
      nbp->coulomb_tab = NULL;
      if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
      {
-        init_ewald_coulomb_force_table(nbp, dev_info);
+        init_ewald_coulomb_force_table(ic, nbp, dev_info);
      }
  
      nnbfp      = 2*ntypes*ntypes;
@@ -448,7 +438,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
      nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
                                                   nb->dev_info);
  
-    init_ewald_coulomb_force_table(nb->nbparam, nb->dev_info);
+    init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
  }
  
  /*! Initializes the pair list data structure. */
@@ -923,6 +913,31 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
      }
  }
  
+static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
+                                          const gmx_device_info_t *dev_info)
+{
+    cudaError_t stat;
+
+    if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
+    {
+#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
+        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+        if (dev_info->prop.major >= 3)
+        {
+            stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
+            CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
+        }
+        else
+#endif
+        {
+            GMX_UNUSED_VALUE(dev_info);
+            stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
+            CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
+        }
+        cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
+    }
+}
+
  void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
  {
      cudaError_t      stat;
@@ -950,24 +965,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
      plist_nl    = nb->plist[eintNonlocal];
      timers      = nb->timers;
  
-    if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
-    {
-
-#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
-        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (nb->dev_info->prop.major >= 3)
-        {
-            stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
-            CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
-        }
-        else
-#endif
-        {
-            stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
-            CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
-        }
-        cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
-    }
+    nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);
  
      stat = cudaEventDestroy(nb->nonlocal_done);
      CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
author	Berk Hess <hess@kth.se>
	Mon, 15 Jun 2015 21:56:26 +0000 (23:56 +0200)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Wed, 24 Jun 2015 20:55:27 +0000 (22:55 +0200)
src/gromacs/ewald/pme-load-balancing.cpp		patch \| blob \| history
src/gromacs/legacyheaders/force.h		patch \| blob \| history
src/gromacs/legacyheaders/types/interaction_const.h		patch \| blob \| history
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_consts.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu		patch \| blob \| history