From 132373bd62ba7b1190952717e097d652ba1f68c0 Mon Sep 17 00:00:00 2001
From: Berk Hess <hess@kth.se>
Date: Mon, 15 Jun 2015 23:56:26 +0200
Subject: [PATCH] GPU Ewald table kernel now uses CPU spacing

The tabulated Ewald GPU kernels (only used by default on Fermi),
used a fixed table spacing. Now the same adaptive spacing as on the
CPU is used. With default settings this leads to a coarser table.

Change-Id: I35f0c89e00301ac7e1260b1c0dcc1604300da5aa
---
 src/gromacs/ewald/pme-load-balancing.cpp      |  11 +-
 src/gromacs/legacyheaders/force.h             |   6 +-
 .../legacyheaders/types/interaction_const.h   |   4 +-
 src/gromacs/mdlib/forcerec.cpp                |  33 ++--
 src/gromacs/mdlib/nbnxn_consts.h              |   4 -
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu  | 142 +++++++++---------
 6 files changed, 93 insertions(+), 107 deletions(-)

diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp
index 2d5ff5e645..fab16ec2b7 100644
--- a/src/gromacs/ewald/pme-load-balancing.cpp
+++ b/src/gromacs/ewald/pme-load-balancing.cpp
@@ -528,7 +528,6 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
     double       cycles_fast;
     char         buf[STRLEN], sbuf[22];
     real         rtab;
-    gmx_bool     bUsesSimpleTables = TRUE;
 
     if (pme_lb->stage == pme_lb->nstage)
     {
@@ -772,7 +771,9 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
         }
     }
 
-    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
+    /* We always re-initialize the tables whether they are used or not */
+    init_interaction_const_tables(NULL, ic, rtab);
+
     nbnxn_gpu_pme_loadbal_update_param(nbv, ic);
 
     /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
@@ -794,12 +795,6 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
     }
 #endif  /* GMX_THREAD_MPI */
 
-    /* Usually we won't need the simple tables with GPUs.
-     * But we do with hybrid acceleration and with free energy.
-     * To avoid bugs, we always re-initialize the simple tables here.
-     */
-    init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab);
-
     if (!pme_lb->bSepPMERanks)
     {
         if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
diff --git a/src/gromacs/legacyheaders/force.h b/src/gromacs/legacyheaders/force.h
index 4014b5740e..fa5c22cff4 100644
--- a/src/gromacs/legacyheaders/force.h
+++ b/src/gromacs/legacyheaders/force.h
@@ -155,12 +155,8 @@ gmx_bool uses_simple_tables(int                        cutoff_scheme,
 
 void init_interaction_const_tables(FILE                *fp,
                                    interaction_const_t *ic,
-                                   gmx_bool             bSimpleTable,
                                    real                 rtab);
-/* Initializes the tables in the interaction constant data structure.
- * Setting verlet_kernel_type to -1 always initializes tables for
- * use with group kernels.
- */
+/* Initializes the tables in the interaction constant data structure. */
 
 void init_forcerec(FILE              *fplog,
                    const output_env_t oenv,
diff --git a/src/gromacs/legacyheaders/types/interaction_const.h b/src/gromacs/legacyheaders/types/interaction_const.h
index 319d6c2a61..e1016d26d1 100644
--- a/src/gromacs/legacyheaders/types/interaction_const.h
+++ b/src/gromacs/legacyheaders/types/interaction_const.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -68,6 +68,8 @@ typedef struct {
 } switch_consts_t;
 
 typedef struct {
+    int             cutoff_scheme;
+
     /* VdW */
     int             vdwtype;
     int             vdw_modifier;
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp
index e5becd4205..7cde5b73b2 100644
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -43,6 +43,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include <algorithm>
+
 #include "gromacs/domdec/domdec.h"
 #include "gromacs/ewald/ewald.h"
 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
@@ -1833,27 +1835,24 @@ gmx_bool uses_simple_tables(int                 cutoff_scheme,
 }
 
 static void init_ewald_f_table(interaction_const_t *ic,
-                               gmx_bool             bUsesSimpleTables,
                                real                 rtab)
 {
     real maxr;
 
-    if (bUsesSimpleTables)
-    {
-        /* Get the Ewald table spacing based on Coulomb and/or LJ
-         * Ewald coefficients and rtol.
-         */
-        ic->tabq_scale = ewald_spline3_table_scale(ic);
+    /* Get the Ewald table spacing based on Coulomb and/or LJ
+     * Ewald coefficients and rtol.
+     */
+    ic->tabq_scale = ewald_spline3_table_scale(ic);
 
-        maxr           = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb;
-        ic->tabq_size  = (int)(maxr*ic->tabq_scale) + 2;
+    if (ic->cutoff_scheme == ecutsVERLET)
+    {
+        maxr = ic->rcoulomb;
     }
     else
     {
-        ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
-        /* Subtract 2 iso 1 to avoid access out of range due to rounding */
-        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
+        maxr = std::max(ic->rcoulomb, rtab);
     }
+    ic->tabq_size  = static_cast<int>(maxr*ic->tabq_scale) + 2;
 
     sfree_aligned(ic->tabq_coul_FDV0);
     sfree_aligned(ic->tabq_coul_F);
@@ -1885,12 +1884,11 @@ static void init_ewald_f_table(interaction_const_t *ic,
 
 void init_interaction_const_tables(FILE                *fp,
                                    interaction_const_t *ic,
-                                   gmx_bool             bUsesSimpleTables,
                                    real                 rtab)
 {
     if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype) || EVDW_PME(ic->vdwtype))
     {
-        init_ewald_f_table(ic, bUsesSimpleTables, rtab);
+        init_ewald_f_table(ic, rtab);
 
         if (fp != NULL)
         {
@@ -1957,6 +1955,8 @@ init_interaction_const(FILE                       *fp,
 
     snew(ic, 1);
 
+    ic->cutoff_scheme   = fr->cutoff_scheme;
+
     /* Just allocate something so we can free it */
     snew_aligned(ic->tabq_coul_FDV0, 16, 32);
     snew_aligned(ic->tabq_coul_F, 16, 32);
@@ -3227,10 +3227,9 @@ void init_forcerec(FILE              *fp,
         init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
     }
 
+    init_interaction_const_tables(fp, fr->ic, rtab);
+
     initialize_gpu_constants(cr, fr->ic, fr->nbv);
-    init_interaction_const_tables(fp, fr->ic,
-                                  uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1),
-                                  rtab);
 
     if (ir->eDispCorr != edispcNO)
     {
diff --git a/src/gromacs/mdlib/nbnxn_consts.h b/src/gromacs/mdlib/nbnxn_consts.h
index d685a453c0..3c5da82932 100644
--- a/src/gromacs/mdlib/nbnxn_consts.h
+++ b/src/gromacs/mdlib/nbnxn_consts.h
@@ -78,10 +78,6 @@ extern "C" {
 #define NBNXN_AVOID_SING_R2_INC  1.0e-36
 #endif
 
-/* Coulomb force table size chosen such that it fits along the non-bonded
-   parameters in the texture cache. */
-#define GPU_EWALD_COULOMB_FORCE_TABLE_SIZE 1536
-
 
 /* Strides for x/f with xyz and xyzq coordinate (and charge) storage */
 #define STRIDE_XYZ   3
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index 1cd9b4aa13..3c2b38bcb2 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -52,7 +52,6 @@
 #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
 #include "gromacs/legacyheaders/gmx_detect_hardware.h"
-#include "gromacs/legacyheaders/tables.h"
 #include "gromacs/legacyheaders/typedefs.h"
 #include "gromacs/legacyheaders/types/enums.h"
 #include "gromacs/legacyheaders/types/force_flags.h"
@@ -114,74 +113,65 @@ static void md_print_warn(FILE       *fplog,
 /* Fw. decl. */
 static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
 
+/* Fw. decl, */
+static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
+                                          const gmx_device_info_t *dev_info);
+
 
 /*! Tabulates the Ewald Coulomb force and initializes the size/scale
     and the table GPU array. If called with an already allocated table,
     it just re-uploads the table.
  */
-static void init_ewald_coulomb_force_table(cu_nbparam_t            *nbp,
-                                           const gmx_device_info_t *dev_info)
+static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
+                                           cu_nbparam_t              *nbp,
+                                           const gmx_device_info_t   *dev_info)
 {
-    float       *ftmp, *coul_tab;
-    int          tabsize;
-    double       tabscale;
+    float       *coul_tab;
     cudaError_t  stat;
 
-    tabsize     = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
-    /* Subtract 2 iso 1 to avoid access out of range due to rounding */
-    tabscale    = (tabsize - 2) / sqrt(nbp->rcoulomb_sq);
-
-    pmalloc((void**)&ftmp, tabsize*sizeof(*ftmp));
-
-    table_spline3_fill_ewald_lr(ftmp, NULL, NULL, tabsize,
-                                1/tabscale, nbp->ewald_beta, v_q_ewald_lr);
-
-    /* If the table pointer == NULL the table is generated the first time =>
-       the array pointer will be saved to nbparam and the texture is bound.
-     */
-    coul_tab = nbp->coulomb_tab;
-    if (coul_tab == NULL)
+    if (nbp->coulomb_tab != NULL)
     {
-        stat = cudaMalloc((void **)&coul_tab, tabsize*sizeof(*coul_tab));
-        CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
+        nbnxn_cuda_free_nbparam_table(nbp, dev_info);
+    }
+
+    stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab));
+    CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
 
-        nbp->coulomb_tab = coul_tab;
+    nbp->coulomb_tab = coul_tab;
 
 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
-        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (dev_info->prop.major >= 3)
-        {
-            cudaResourceDesc rd;
-            memset(&rd, 0, sizeof(rd));
-            rd.resType                  = cudaResourceTypeLinear;
-            rd.res.linear.devPtr        = nbp->coulomb_tab;
-            rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
-            rd.res.linear.desc.x        = 32;
-            rd.res.linear.sizeInBytes   = tabsize*sizeof(*coul_tab);
+    /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+    if (dev_info->prop.major >= 3)
+    {
+        cudaResourceDesc rd;
+        memset(&rd, 0, sizeof(rd));
+        rd.resType                  = cudaResourceTypeLinear;
+        rd.res.linear.devPtr        = nbp->coulomb_tab;
+        rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
+        rd.res.linear.desc.x        = 32;
+        rd.res.linear.sizeInBytes   = ic->tabq_size*sizeof(*coul_tab);
 
-            cudaTextureDesc td;
-            memset(&td, 0, sizeof(td));
-            td.readMode                 = cudaReadModeElementType;
-            stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
-            CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
-        }
-        else
+        cudaTextureDesc td;
+        memset(&td, 0, sizeof(td));
+        td.readMode                 = cudaReadModeElementType;
+        stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL);
+        CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed");
+    }
+    else
 #endif  /* HAVE_CUDA_TEXOBJ_SUPPORT */
-        {
-            GMX_UNUSED_VALUE(dev_info);
-            cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
-            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
-                                   coul_tab, &cd, tabsize*sizeof(*coul_tab));
-            CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
-        }
+    {
+        GMX_UNUSED_VALUE(dev_info);
+        cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
+        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
+                               coul_tab, &cd,
+                               ic->tabq_size*sizeof(*coul_tab));
+        CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed");
     }
 
-    cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab));
-
-    nbp->coulomb_tab_size     = tabsize;
-    nbp->coulomb_tab_scale    = tabscale;
+    cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab));
 
-    pfree(ftmp);
+    nbp->coulomb_tab_size     = ic->tabq_size;
+    nbp->coulomb_tab_scale    = ic->tabq_scale;
 }
 
 
@@ -362,7 +352,7 @@ static void init_nbparam(cu_nbparam_t              *nbp,
     nbp->coulomb_tab = NULL;
     if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
     {
-        init_ewald_coulomb_force_table(nbp, dev_info);
+        init_ewald_coulomb_force_table(ic, nbp, dev_info);
     }
 
     nnbfp      = 2*ntypes*ntypes;
@@ -448,7 +438,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
                                                  nb->dev_info);
 
-    init_ewald_coulomb_force_table(nb->nbparam, nb->dev_info);
+    init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
 }
 
 /*! Initializes the pair list data structure. */
@@ -923,6 +913,31 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
     }
 }
 
+static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
+                                          const gmx_device_info_t *dev_info)
+{
+    cudaError_t stat;
+
+    if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
+    {
+#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
+        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+        if (dev_info->prop.major >= 3)
+        {
+            stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
+            CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
+        }
+        else
+#endif
+        {
+            GMX_UNUSED_VALUE(dev_info);
+            stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
+            CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
+        }
+        cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
+    }
+}
+
 void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
 {
     cudaError_t      stat;
@@ -950,24 +965,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
     plist_nl    = nb->plist[eintNonlocal];
     timers      = nb->timers;
 
-    if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
-    {
-
-#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
-        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (nb->dev_info->prop.major >= 3)
-        {
-            stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
-            CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
-        }
-        else
-#endif
-        {
-            stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
-            CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
-        }
-        cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
-    }
+    nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);
 
     stat = cudaEventDestroy(nb->nonlocal_done);
     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
-- 
2.22.0