Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_cuda / nbnxn_cuda_data_mgmt.cu
index fd75a3d1f0cc08c6bc4cf2015e65bfa2583cea47..6612d13097c1625da9b67b864b5581ec51179fa3 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
+
+#include "nbnxn_cuda_data_mgmt.h"
+
+#include "config.h"
 
-#include <stdlib.h>
-#include <stdio.h>
 #include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 #include <cuda.h>
 
-#include "gmx_fatal.h"
-#include "smalloc.h"
-#include "tables.h"
-#include "typedefs.h"
-#include "types/nb_verlet.h"
-#include "types/interaction_const.h"
-#include "types/force_flags.h"
-#include "../nbnxn_consts.h"
-#include "gmx_detect_hardware.h"
+#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
+#include "gromacs/legacyheaders/gmx_detect_hardware.h"
+#include "gromacs/legacyheaders/gpu_utils.h"
+#include "gromacs/legacyheaders/pmalloc_cuda.h"
+#include "gromacs/legacyheaders/tables.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/common.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
 
 #include "nbnxn_cuda_types.h"
-#include "../../gmxlib/cuda_tools/cudautils.cuh"
-#include "nbnxn_cuda_data_mgmt.h"
-#include "pmalloc_cuda.h"
-#include "gpu_utils.h"
-
-#include "gromacs/utility/common.h"
 
 static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
 
@@ -71,6 +75,7 @@ static unsigned int gpu_min_ci_balanced_factor = 40;
 /* Functions from nbnxn_cuda.cu */
 extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
+extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
 extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
 
 /* We should actually be using md_print_warn in md_logging.c,
@@ -124,7 +129,7 @@ static void init_ewald_coulomb_force_table(cu_nbparam_t          *nbp,
     pmalloc((void**)&ftmp, tabsize*sizeof(*ftmp));
 
     table_spline3_fill_ewald_lr(ftmp, NULL, NULL, tabsize,
-                                1/tabscale, nbp->ewald_beta);
+                                1/tabscale, nbp->ewald_beta, v_q_ewald_lr);
 
     /* If the table pointer == NULL the table is generated the first time =>
        the array pointer will be saved to nbparam and the texture is bound.
@@ -257,6 +262,27 @@ static int pick_ewald_kernel_type(bool                   bTwinCut,
     return kernel_type;
 }
 
+/*! Copies all parameters related to the cut-off from ic to nbp */
+static void set_cutoff_parameters(cu_nbparam_t              *nbp,
+                                  const interaction_const_t *ic)
+{
+    nbp->ewald_beta       = ic->ewaldcoeff_q;
+    nbp->sh_ewald         = ic->sh_ewald;
+    nbp->epsfac           = ic->epsfac;
+    nbp->two_k_rf         = 2.0 * ic->k_rf;
+    nbp->c_rf             = ic->c_rf;
+    nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
+    nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
+    nbp->rlist_sq         = ic->rlist * ic->rlist;
+
+    nbp->sh_lj_ewald      = ic->sh_lj_ewald;
+    nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
+
+    nbp->rvdw_switch      = ic->rvdw_switch;
+    nbp->dispersion_shift = ic->dispersion_shift;
+    nbp->repulsion_shift  = ic->repulsion_shift;
+    nbp->vdw_switch       = ic->vdw_switch;
+}
 
 /*! Initializes the nonbonded parameter data structure. */
 static void init_nbparam(cu_nbparam_t              *nbp,
@@ -265,19 +291,48 @@ static void init_nbparam(cu_nbparam_t              *nbp,
                          const cuda_dev_info_t     *dev_info)
 {
     cudaError_t stat;
-    int         ntypes, nnbfp;
+    int         ntypes, nnbfp, nnbfp_comb;
 
     ntypes  = nbat->ntype;
 
-    nbp->ewald_beta  = ic->ewaldcoeff_q;
-    nbp->sh_ewald    = ic->sh_ewald;
-    nbp->epsfac      = ic->epsfac;
-    nbp->two_k_rf    = 2.0 * ic->k_rf;
-    nbp->c_rf        = ic->c_rf;
-    nbp->rvdw_sq     = ic->rvdw * ic->rvdw;
-    nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
-    nbp->rlist_sq    = ic->rlist * ic->rlist;
-    nbp->sh_invrc6   = ic->sh_invrc6;
+    set_cutoff_parameters(nbp, ic);
+
+    if (ic->vdwtype == evdwCUT)
+    {
+        switch (ic->vdw_modifier)
+        {
+            case eintmodNONE:
+            case eintmodPOTSHIFT:
+                nbp->vdwtype = evdwCuCUT;
+                break;
+            case eintmodFORCESWITCH:
+                nbp->vdwtype = evdwCuFSWITCH;
+                break;
+            case eintmodPOTSWITCH:
+                nbp->vdwtype = evdwCuPSWITCH;
+                break;
+            default:
+                gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
+                break;
+        }
+    }
+    else if (ic->vdwtype == evdwPME)
+    {
+        if (ic->ljpme_comb_rule == ljcrGEOM)
+        {
+            assert(nbat->comb_rule == ljcrGEOM);
+            nbp->vdwtype = evdwCuEWALDGEOM;
+        }
+        else
+        {
+            assert(nbat->comb_rule == ljcrLB);
+            nbp->vdwtype = evdwCuEWALDLB;
+        }
+    }
+    else
+    {
+        gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
+    }
 
     if (ic->eeltype == eelCUT)
     {
@@ -305,16 +360,28 @@ static void init_nbparam(cu_nbparam_t              *nbp,
         init_ewald_coulomb_force_table(nbp, dev_info);
     }
 
-    nnbfp = 2*ntypes*ntypes;
+    nnbfp      = 2*ntypes*ntypes;
+    nnbfp_comb = 2*ntypes;
+
     stat  = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
     CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
     cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 
+
+    if (ic->vdwtype == evdwPME)
+    {
+        stat  = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
+        CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
+        cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
+    }
+
 #ifdef TEXOBJ_SUPPORTED
     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
     if (dev_info->prop.major >= 3)
     {
         cudaResourceDesc rd;
+        cudaTextureDesc  td;
+
         memset(&rd, 0, sizeof(rd));
         rd.resType                  = cudaResourceTypeLinear;
         rd.res.linear.devPtr        = nbp->nbfp;
@@ -322,11 +389,25 @@ static void init_nbparam(cu_nbparam_t              *nbp,
         rd.res.linear.desc.x        = 32;
         rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
 
-        cudaTextureDesc td;
         memset(&td, 0, sizeof(td));
         td.readMode                 = cudaReadModeElementType;
         stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
         CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
+
+        if (ic->vdwtype == evdwPME)
+        {
+            memset(&rd, 0, sizeof(rd));
+            rd.resType                  = cudaResourceTypeLinear;
+            rd.res.linear.devPtr        = nbp->nbfp_comb;
+            rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
+            rd.res.linear.desc.x        = 32;
+            rd.res.linear.sizeInBytes   = nnbfp_comb*sizeof(*nbp->nbfp_comb);
+
+            memset(&td, 0, sizeof(td));
+            td.readMode = cudaReadModeElementType;
+            stat        = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
+            CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
+        }
     }
     else
 #endif
@@ -335,19 +416,29 @@ static void init_nbparam(cu_nbparam_t              *nbp,
         stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
                                nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
         CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
+
+        if (ic->vdwtype == evdwPME)
+        {
+            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
+                                   nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
+            CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
+        }
     }
 }
 
 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
  *  electrostatic type switching to twin cut-off (or back) if needed. */
-void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t           cu_nb,
-                                         const interaction_const_t *ic)
+void nbnxn_cuda_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
+                                         const interaction_const_t   *ic)
 {
-    cu_nbparam_t *nbp = cu_nb->nbparam;
+    if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_CUDA)
+    {
+        return;
+    }
+    nbnxn_cuda_ptr_t cu_nb = nbv->cu_nbv;
+    cu_nbparam_t    *nbp   = cu_nb->nbparam;
 
-    nbp->rlist_sq       = ic->rlist * ic->rlist;
-    nbp->rcoulomb_sq    = ic->rcoulomb * ic->rcoulomb;
-    nbp->ewald_beta     = ic->ewaldcoeff_q;
+    set_cutoff_parameters(nbp, ic);
 
     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
                                                  cu_nb->dev_info);
@@ -488,7 +579,7 @@ void nbnxn_cuda_init(FILE                 *fplog,
          * priorities, because we are querying the priority range which in this
          * case will be a single value.
          */
-#if CUDA_VERSION >= 5500
+#if CUDA_VERSION >= 5050
         {
             int highest_priority;
             stat = cudaDeviceGetStreamPriorityRange(NULL, &highest_priority);
@@ -920,6 +1011,24 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
     }
     cu_free_buffered(nbparam->nbfp);
 
+    if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
+    {
+#ifdef TEXOBJ_SUPPORTED
+        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+        if (cu_nb->dev_info->prop.major >= 3)
+        {
+            stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
+            CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
+        }
+        else
+#endif
+        {
+            stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
+            CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
+        }
+        cu_free_buffered(nbparam->nbfp_comb);
+    }
+
     stat = cudaFree(atdat->shift_vec);
     CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
     stat = cudaFree(atdat->fshift);
@@ -975,11 +1084,11 @@ wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
     return (cu_nb != NULL && cu_nb->bDoTime) ? cu_nb->timings : NULL;
 }
 
-void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb)
+void nbnxn_cuda_reset_timings(nonbonded_verlet_t* nbv)
 {
-    if (cu_nb->bDoTime)
+    if (nbv->cu_nbv && nbv->cu_nbv->bDoTime)
     {
-        init_timings(cu_nb->timings);
+        init_timings(nbv->cu_nbv->timings);
     }
 }