remove legacy CUDA non-bonded kernels

author Szilard Pall <pall.szilard@gmail.com>

Sun, 1 Dec 2013 21:53:42 +0000 (22:53 +0100)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Mon, 2 Dec 2013 23:19:26 +0000 (00:19 +0100)
author Szilard Pall <pall.szilard@gmail.com>
Sun, 1 Dec 2013 21:53:42 +0000 (22:53 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Mon, 2 Dec 2013 23:19:26 +0000 (00:19 +0100)
diff --git a/manual/install.tex b/manual/install.tex

index 715f1b68a9753432b7ed2e7b0d35950822642120..525453583de2f61a44570b80f646628325b011bf 100644 (file)
--- a/manual/install.tex
+++ b/manual/install.tex
@@ -227,16 +227,11 @@ you should consult your local documentation for details.
          {\tt .cpt} file.
  \item   {\tt GMX_CAPACITY}: the maximum capacity of charge groups per
          processor when using particle decomposition.
          {\tt .cpt} file.
  \item   {\tt GMX_CAPACITY}: the maximum capacity of charge groups per
          processor when using particle decomposition.
-\item   {\tt GMX_CUDA_NB_DEFAULT}: Force the use of the default CUDA non-bonded kernels instead of
-        the legacy ones; mutually exclusive of {\tt GMX_CUDA_NB_LEGACY}.
  \item   {\tt GMX_CUDA_NB_EWALD_TWINCUT}: force the use of twin-range cutoff kernel even if {\tt rvdw} =
          {\tt rcoulomb} after PP-PME load balancing. The switch to twin-range kernels is automated,
          so this variable should be used only for benchmarking.
  \item   {\tt GMX_CUDA_NB_ANA_EWALD}: force the use of analytical Ewald kernels. Should be used only for benchmarking.
  \item   {\tt GMX_CUDA_NB_TAB_EWALD}: force the use of tabulated Ewald kernels. Should be used only for benchmarking.
  \item   {\tt GMX_CUDA_NB_EWALD_TWINCUT}: force the use of twin-range cutoff kernel even if {\tt rvdw} =
          {\tt rcoulomb} after PP-PME load balancing. The switch to twin-range kernels is automated,
          so this variable should be used only for benchmarking.
  \item   {\tt GMX_CUDA_NB_ANA_EWALD}: force the use of analytical Ewald kernels. Should be used only for benchmarking.
  \item   {\tt GMX_CUDA_NB_TAB_EWALD}: force the use of tabulated Ewald kernels. Should be used only for benchmarking.
-\item   {\tt GMX_CUDA_NB_LEGACY}: Force the use of the legacy CUDA non-bonded kernels, which are
-        the default when using the CUDA toolkit versions 3.2 or 4.0 on Fermi NVIDIA GPUs (compute capability 2.x);
-        mutually exclusive of {\tt GMX_CUDA_NB_DEFAULT}.
  \item   {\tt GMX_CUDA_STREAMSYNC}: force the use of cudaStreamSynchronize on ECC-enabled GPUs, which leads
          to performance loss due to a known CUDA driver bug present in API v5.0 NVIDIA drivers (pre-30x.xx).
          Cannot be set simultaneously with {\tt GMX_NO_CUDA_STREAMSYNC}.
  \item   {\tt GMX_CUDA_STREAMSYNC}: force the use of cudaStreamSynchronize on ECC-enabled GPUs, which leads
          to performance loss due to a known CUDA driver bug present in API v5.0 NVIDIA drivers (pre-30x.xx).
          Cannot be set simultaneously with {\tt GMX_NO_CUDA_STREAMSYNC}.
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index 511d6a3f9df4381b39ed49fce9eb703e4a9e13bd..3c1a235fcdc2a082def7ced35425eb14227b323d 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -169,74 +169,35 @@ nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
        { k_nbnxn_ewald_twin_ener,            k_nbnxn_ewald_twin_ener_prune } },
  };
  
        { k_nbnxn_ewald_twin_ener,            k_nbnxn_ewald_twin_ener_prune } },
  };
  
-/*! Pointers to the legacy kernels organized in a 3 dim array by:
- *  electrostatics type, energy calculation on/off, and pruning on/off.
- *
- *  Note that the order of electrostatics (1st dimension) has to match the
- *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
- */
-static const nbnxn_cu_kfunc_ptr_t
-nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
-{
-    { { k_nbnxn_cutoff_legacy,              k_nbnxn_cutoff_prune_legacy },
-      { k_nbnxn_cutoff_ener_legacy,         k_nbnxn_cutoff_ener_prune_legacy } },
-    { { k_nbnxn_rf_legacy,                  k_nbnxn_rf_prune_legacy },
-      { k_nbnxn_rf_ener_legacy,             k_nbnxn_rf_ener_prune_legacy } },
-    { { k_nbnxn_ewald_tab_legacy,           k_nbnxn_ewald_tab_prune_legacy },
-      { k_nbnxn_ewald_tab_ener_legacy,      k_nbnxn_ewald_tab_ener_prune_legacy } },
-    { { k_nbnxn_ewald_tab_twin_legacy,      k_nbnxn_ewald_tab_twin_prune_legacy },
-      { k_nbnxn_ewald_tab_twin_ener_legacy, k_nbnxn_ewald_tab_twin_ener_prune_legacy } },
-};
-
  /*! Return a pointer to the kernel version to be executed at the current step. */
  /*! Return a pointer to the kernel version to be executed at the current step. */
-static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
-                                                       bool bDoEne, bool bDoPrune)
+static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int  eeltype,
+                                                       bool bDoEne,
+                                                       bool bDoPrune)
  {
  {
-    assert(kver < eNbnxnCuKNR);
      assert(eeltype < eelCuNR);
  
      assert(eeltype < eelCuNR);
  
-    if (NBNXN_KVER_LEGACY(kver))
-    {
-        /* no analytical Ewald with legacy kernels */
-        assert(eeltype <= eelCuEWALD_TAB_TWIN);
-
-        return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
-    }
-    else
-    {
-        return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
-    }
+    return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
  }
  
  }
  
-/*! Calculates the amount of shared memory required for kernel version in use. */
-static inline int calc_shmem_required(int kver)
+/*! Calculates the amount of shared memory required by the CUDA kernel in use. */
+static inline int calc_shmem_required()
  {
      int shmem;
  
      /* size of shmem (force-buffers/xq/atom type preloading) */
  {
      int shmem;
  
      /* size of shmem (force-buffers/xq/atom type preloading) */
-    if (NBNXN_KVER_LEGACY(kver))
-    {
-        /* i-atom x+q in shared memory */
-        shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
-        /* force reduction buffers in shared memory */
-        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
-    }
-    else
-    {
-        /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
-        /* i-atom x+q in shared memory */
-        shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
-        /* cj in shared memory, for both warps separately */
-        shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
+    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
+    /* i-atom x+q in shared memory */
+    shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
+    /* cj in shared memory, for both warps separately */
+    shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
  #ifdef IATYPE_SHMEM
  #ifdef IATYPE_SHMEM
-        /* i-atom types in shared memory */
-        shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
+    /* i-atom types in shared memory */
+    shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
  #endif
  #if __CUDA_ARCH__ < 300
  #endif
  #if __CUDA_ARCH__ < 300
-        /* force reduction buffers in shared memory */
-        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+    /* force reduction buffers in shared memory */
+    shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
  #endif
  #endif
-    }
  
      return shmem;
  }
  
      return shmem;
  }
@@ -341,14 +302,14 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
      }
  
      /* get the pointer to the kernel flavor we need to use */
      }
  
      /* get the pointer to the kernel flavor we need to use */
-    nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
+    nb_kernel = select_nbnxn_kernel(nbp->eeltype, bCalcEner,
                                      plist->bDoPrune || always_prune);
  
      /* kernel launch config */
      nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
      dim_block = dim3(CL_SIZE, CL_SIZE, 1);
      dim_grid  = dim3(nblock, 1, 1);
                                      plist->bDoPrune || always_prune);
  
      /* kernel launch config */
      nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
      dim_block = dim3(CL_SIZE, CL_SIZE, 1);
      dim_grid  = dim3(nblock, 1, 1);
-    shmem     = calc_shmem_required(cu_nb->kernel_ver);
+    shmem     = calc_shmem_required();
  
      if (debug)
      {
  
      if (debug)
      {
@@ -683,15 +644,6 @@ void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
          {
              for (int k = 0; k < nPruneKernelTypes; k++)
              {
          {
              for (int k = 0; k < nPruneKernelTypes; k++)
              {
-                /* Legacy kernel 16/48 kB Shared/L1
-                 * No analytical Ewald!
-                 */
-                if (i != eelCuEWALD_ANA && i != eelCuEWALD_ANA_TWIN)
-                {
-                    stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
-                    CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
-                }
-
                  if (devinfo->prop.major >= 3)
                  {
                      /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
                  if (devinfo->prop.major >= 3)
                  {
                      /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index a0db5b8db21b9c48b633e553a37b0063d9ff6b8f..6d26a86560f56aa2bf7833de05b400c78071ff52 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -429,77 +429,6 @@ static void init_timings(wallclock_gpu_t *t)
      }
  }
  
      }
  }
  
-/* Decide which kernel version to use (default or legacy) based on:
- *  - CUDA version used for compilation
- *  - non-bonded kernel selector environment variables
- *  - GPU architecture version
- */
-static int pick_nbnxn_kernel_version(FILE            *fplog,
-                                     cuda_dev_info_t *devinfo)
-{
-    bool bForceLegacyKernel, bForceDefaultKernel, bCUDA40, bCUDA32;
-    char sbuf[STRLEN];
-    int  kver;
-
-    /* Legacy kernel (former k2), kept for backward compatibility as it is
-       faster than the default with CUDA 3.2/4.0 on Fermi (not on Kepler). */
-    bForceLegacyKernel  = (getenv("GMX_CUDA_NB_LEGACY") != NULL);
-    /* default kernel (former k3). */
-    bForceDefaultKernel = (getenv("GMX_CUDA_NB_DEFAULT") != NULL);
-
-    if ((unsigned)(bForceLegacyKernel + bForceDefaultKernel) > 1)
-    {
-        gmx_fatal(FARGS, "Multiple CUDA non-bonded kernels requested; to manually pick a kernel set only one \n"
-                  "of the following environment variables: \n"
-                  "GMX_CUDA_NB_DEFAULT, GMX_CUDA_NB_LEGACY");
-    }
-
-    bCUDA32 = bCUDA40 = false;
-#if CUDA_VERSION == 3200
-    bCUDA32 = true;
-    sprintf(sbuf, "3.2");
-#elif CUDA_VERSION == 4000
-    bCUDA40 = true;
-    sprintf(sbuf, "4.0");
-#endif
-
-    /* default is default ;) */
-    kver = eNbnxnCuKDefault;
-
-    /* Consider switching to legacy kernels only on Fermi */
-    if (devinfo->prop.major < 3 && (bCUDA32 || bCUDA40))
-    {
-        /* use legacy kernel unless something else is forced by an env. var */
-        if (bForceDefaultKernel)
-        {
-            md_print_warn(fplog,
-                          "NOTE: CUDA %s compilation detected; with this compiler version the legacy\n"
-                          "      non-bonded kernels perform best. However, the default kernels were\n"
-                          "      selected by the GMX_CUDA_NB_DEFAULT environment variable.\n"
-                          "      For best performance upgrade your CUDA toolkit.\n",
-                          sbuf);
-        }
-        else
-        {
-            kver = eNbnxnCuKLegacy;
-        }
-    }
-    else
-    {
-        /* issue note if the non-default kernel is forced by an env. var */
-        if (bForceLegacyKernel)
-        {
-            md_print_warn(fplog,
-                    "NOTE: Legacy non-bonded CUDA kernels selected by the GMX_CUDA_NB_LEGACY\n"
-                    "      env. var. Consider using using the default kernels which should be faster!\n");
-
-            kver = eNbnxnCuKLegacy;
-        }
-    }
-
-    return kver;
-}
-
  void nbnxn_cuda_init(FILE *fplog,
                       nbnxn_cuda_ptr_t *p_cu_nb,
                       const gmx_gpu_info_t *gpu_info,
  void nbnxn_cuda_init(FILE *fplog,
                       nbnxn_cuda_ptr_t *p_cu_nb,
                       const gmx_gpu_info_t *gpu_info,
@@ -697,7 +626,6 @@ void nbnxn_cuda_init(FILE *fplog,
      }
  
      /* set the kernel type for the current GPU */
      }
  
      /* set the kernel type for the current GPU */
-    nb->kernel_ver = pick_nbnxn_kernel_version(fplog, nb->dev_info);
      /* pick L1 cache configuration */
      nbnxn_cuda_set_cacheconfig(nb->dev_info);
  
      /* pick L1 cache configuration */
      nbnxn_cuda_set_cacheconfig(nb->dev_info);
  
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh

deleted file mode 100644 (file)

index 6570307..0000000
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
+++ /dev/null
@@ -1,388 +0,0 @@
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
- *
- *
- *                This source code is part of
- *
- *                 G   R   O   M   A   C   S
- *
- *          GROningen MAchine for Chemical Simulations
- *
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2012, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
- *
- * For more info, check our website at http://www.gromacs.org
- *
- * And Hey:
- * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
- */
-
-#include "maths.h"
-/* Note that floating-point constants in CUDA code should be suffixed
- * with f (e.g. 0.5f), to stop the compiler producing intermediate
- * code that is in double precision.
- */
-
-/* Analytical Ewald is not implemented for the legacy kernels (as it is anyway
-   slower than the tabulated kernel on Fermi). */
-#ifdef EL_EWALD_ANA
-#error Trying to generate Analytical Ewald legacy kernels which is not implemented in the legacy kernels!
-#endif
-
-/*
-   Kernel launch parameters:
-    - #blocks   = #pair lists, blockId = pair list Id
-    - #threads  = CL_SIZE^2
-    - shmem     = CL_SIZE^2 * sizeof(float)
-
-    Each thread calculates an i force-component taking one pair of i-j atoms.
- */
-#if __CUDA_ARCH__ >= 350
-__launch_bounds__(64,16)
-#endif
-#ifdef PRUNE_NBL
-#ifdef CALC_ENERGIES
-__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_prune_legacy)
-#else
-__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _prune_legacy)
-#endif
-#else
-#ifdef CALC_ENERGIES
-__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _ener_legacy)
-#else
-__global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
-#endif
-#endif
-            (const cu_atomdata_t atdat,
-             const cu_nbparam_t nbparam,
-             const cu_plist_t plist,
-             bool bCalcFshift)
-{
-    /* convenience variables */
-    const nbnxn_sci_t *pl_sci   = plist.sci;
-#ifndef PRUNE_NBL
-    const
-#endif
-    nbnxn_cj4_t *pl_cj4         = plist.cj4;
-    const nbnxn_excl_t *excl    = plist.excl;
-    const int *atom_types       = atdat.atom_types;
-    int ntypes                  = atdat.ntypes;
-    const float4 *xq            = atdat.xq;
-    float3 *f                   = atdat.f;
-    const float3 *shift_vec     = atdat.shift_vec;
-    float rcoulomb_sq           = nbparam.rcoulomb_sq;
-#ifdef VDW_CUTOFF_CHECK
-    float rvdw_sq               = nbparam.rvdw_sq;
-    float vdw_in_range;
-#endif
-#ifdef EL_RF
-    float two_k_rf              = nbparam.two_k_rf;
-#endif
-#ifdef EL_EWALD_TAB
-    float coulomb_tab_scale     = nbparam.coulomb_tab_scale;
-#endif
-#ifdef PRUNE_NBL
-    float rlist_sq              = nbparam.rlist_sq;
-#endif
-
-#ifdef CALC_ENERGIES
-    float lj_shift    = nbparam.sh_invrc6;
-#ifdef EL_EWALD_TAB
-    float beta        = nbparam.ewald_beta;
-    float ewald_shift = nbparam.sh_ewald;
-#else
-    float c_rf        = nbparam.c_rf;
-#endif
-    float *e_lj       = atdat.e_lj;
-    float *e_el       = atdat.e_el;
-#endif
-
-    /* thread/block/warp id-s */
-    unsigned int tidxi  = threadIdx.x;
-    unsigned int tidxj  = threadIdx.y;
-    unsigned int tidx   = threadIdx.y * blockDim.x + threadIdx.x;
-    unsigned int bidx   = blockIdx.x;
-    unsigned int widx   = tidx / WARP_SIZE; /* warp index */
-
-    int sci, ci, cj, ci_offset,
-        ai, aj,
-        cij4_start, cij4_end,
-        typei, typej,
-        i, cii, jm, j4, nsubi, wexcl_idx;
-    float qi, qj_f,
-          r2, inv_r, inv_r2, inv_r6,
-          c6, c12,
-          int_bit,
-#ifdef CALC_ENERGIES
-          E_lj, E_el, E_lj_p,
-#endif
-          F_invr;
-    unsigned int wexcl, imask, mask_ji;
-    float4 xqbuf;
-    float3 xi, xj, rv, f_ij, fcj_buf, fshift_buf;
-    float3 fci_buf[NCL_PER_SUPERCL];    /* i force buffer */
-    nbnxn_sci_t nb_sci;
-
-    /* shmem buffer for i x+q pre-loading */
-    extern __shared__  float4 xqib[];
-    /* shmem j force buffer */
-    float *f_buf = (float *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
-
-    nb_sci      = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
-    sci         = nb_sci.sci;           /* super-cluster */
-    cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
-    cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
-
-    /* Store the i-atom x and q in shared memory */
-    /* Note: the thread indexing here is inverted with respect to the
-       inner-loop as this results in slightly higher performance */
-    ci = sci * NCL_PER_SUPERCL + tidxi;
-    ai = ci * CL_SIZE + tidxj;
-    xqib[tidxi * CL_SIZE + tidxj] = xq[ai] + shift_vec[nb_sci.shift];
-    __syncthreads();
-
-    for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
-    {
-        fci_buf[ci_offset] = make_float3(0.0f);
-    }
-
-#ifdef CALC_ENERGIES
-    E_lj = 0.0f;
-    E_el = 0.0f;
-
-#if defined EL_EWALD_TAB || defined EL_RF
-    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
-    {
-        /* we have the diagonal: add the charge self interaction energy term */
-        for (i = 0; i < NCL_PER_SUPERCL; i++)
-        {
-            qi    = xqib[i * CL_SIZE + tidxi].w;
-            E_el += qi*qi;
-        }
-        /* divide the self term equally over the j-threads */
-        E_el /= CL_SIZE;
-#ifdef EL_RF
-        E_el *= -nbparam.epsfac*0.5f*c_rf;
-#else
-        E_el *= -nbparam.epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
-#endif
-    }
-#endif
-#endif
-
-    /* skip central shifts when summing shift forces */
-    if (nb_sci.shift == CENTRAL)
-    {
-        bCalcFshift = false;
-    }
-
-    fshift_buf = make_float3(0.0f);
-
-    /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
-    for (j4 = cij4_start; j4 < cij4_end; j4++)
-    {
-        wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
-        imask       = pl_cj4[j4].imei[widx].imask;
-        wexcl       = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
-
-#ifndef PRUNE_NBL
-        if (imask)
-#endif
-        {
-            /* nvcc >v4.1 doesn't like this loop, it refuses to unroll it */
-#if CUDA_VERSION >= 4010
-            #pragma unroll 4
-#endif
-            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
-            {
-                mask_ji = (imask >> (jm * CL_SIZE)) & supercl_interaction_mask;
-                if (mask_ji)
-                {
-                    nsubi = __popc(mask_ji);
-
-                    cj      = pl_cj4[j4].cj[jm];
-                    aj      = cj * CL_SIZE + tidxj;
-
-                    /* load j atom data */
-                    xqbuf   = xq[aj];
-                    xj      = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
-                    qj_f    = nbparam.epsfac * xqbuf.w;
-                    typej   = atom_types[aj];
-
-                    fcj_buf = make_float3(0.0f);
-
-                    /* loop over the i-clusters in sci */
-                    /* #pragma unroll 8
-                       -- nvcc doesn't like my code, it refuses to unroll it
-                       which is a pity because here unrolling could help.  */
-                    for (cii = 0; cii < nsubi; cii++)
-                    {
-                        i = __ffs(mask_ji) - 1;
-                        mask_ji &= ~(1U << i);
-
-                        ci_offset   = i;    /* i force buffer offset */
-
-                        ci      = sci * NCL_PER_SUPERCL + i; /* i cluster index */
-                        ai      = ci * CL_SIZE + tidxi;      /* i atom index */
-
-                        /* all threads load an atom from i cluster ci into shmem! */
-                        xqbuf   = xqib[i * CL_SIZE + tidxi];
-                        xi      = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
-
-                        /* distance between i and j atoms */
-                        rv      = xi - xj;
-                        r2      = norm2(rv);
-
-#ifdef PRUNE_NBL
-                        /* If _none_ of the atoms pairs are in cutoff range,
-                               the bit corresponding to the current
-                               cluster-pair in imask gets set to 0. */
-                        if (!__any(r2 < rlist_sq))
-                        {
-                            imask &= ~(1U << (jm * NCL_PER_SUPERCL + i));
-                        }
-#endif
-
-                        int_bit = ((wexcl >> (jm * NCL_PER_SUPERCL + i)) & 1) ? 1.0f : 0.0f;
-
-                        /* cutoff & exclusion check */
-#if defined EL_EWALD_TAB || defined EL_RF
-                        if (r2 < rcoulomb_sq *
-                            (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
-#else
-                        if (r2 < rcoulomb_sq * int_bit)
-#endif
-                        {
-                            /* load the rest of the i-atom parameters */
-                            qi      = xqbuf.w;
-                            typei   = atom_types[ai];
-
-                            /* LJ 6*C6 and 12*C12 */
-                            c6      = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej));
-                            c12     = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej) + 1);
-
-                            /* avoid NaN for excluded pairs at r=0 */
-                            r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
-
-                            inv_r   = rsqrt(r2);
-                            inv_r2  = inv_r * inv_r;
-                            inv_r6  = inv_r2 * inv_r2 * inv_r2;
-#if defined EL_EWALD_TAB || defined EL_RF
-                            /* We could mask inv_r2, but with Ewald
-                             * masking both inv_r6 and F_invr is faster */
-                            inv_r6  *= int_bit;
-#endif
-
-                            F_invr  = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
-
-#ifdef CALC_ENERGIES
-                            E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f);
-#endif
-
-#ifdef VDW_CUTOFF_CHECK
-                            /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
-                            vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
-                            F_invr  *= vdw_in_range;
-#ifdef CALC_ENERGIES
-                            E_lj_p  *= vdw_in_range;
-#endif
-#endif
-#ifdef CALC_ENERGIES
-                            E_lj    += E_lj_p;
-#endif
-
-
-#ifdef EL_CUTOFF
-                            F_invr  += qi * qj_f * inv_r2 * inv_r;
-#endif
-#ifdef EL_RF
-                            F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
-#endif
-#ifdef EL_EWALD_TAB
-                            F_invr  += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r;
-#endif /* EL_EWALD_TAB */
-
-#ifdef CALC_ENERGIES
-#ifdef EL_CUTOFF
-                            E_el    += qi * qj_f * (inv_r - c_rf);
-#endif
-#ifdef EL_RF
-                            E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
-#endif
-#ifdef EL_EWALD_TAB
-                            /* 1.0f - erff is faster than erfcf */
-                            E_el    += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
-#endif
-#endif
-                            f_ij    = rv * F_invr;
-
-                            /* accumulate j forces in registers */
-                            fcj_buf -= f_ij;
-
-                            /* accumulate i forces in registers */
-                            fci_buf[ci_offset] += f_ij;
-                        }
-                    }
-
-                    /* store j forces in shmem */
-                    f_buf[                  tidx] = fcj_buf.x;
-                    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
-                    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
-
-                    /* reduce j forces */
-                    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
-                }
-            }
-#ifdef PRUNE_NBL
-            /* Update the imask with the new one which does not contain the
-               out of range clusters anymore. */
-            pl_cj4[j4].imei[widx].imask = imask;
-#endif
-        }
-    }
-
-    /* reduce i forces */
-    for(ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
-    {
-        ai  = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
-        f_buf[                  tidx] = fci_buf[ci_offset].x;
-        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
-        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
-        __syncthreads();
-        reduce_force_i(f_buf, f,
-                       &fshift_buf, bCalcFshift,
-                       tidxi, tidxj, ai);
-        __syncthreads();
-    }
-
-    /* add up local shift forces into global mem */
-    if (bCalcFshift && tidxj == 0)
-    {
-        atomicAdd(&atdat.fshift[nb_sci.shift].x, fshift_buf.x);
-        atomicAdd(&atdat.fshift[nb_sci.shift].y, fshift_buf.y);
-        atomicAdd(&atdat.fshift[nb_sci.shift].z, fshift_buf.z);
-    }
-
-#ifdef CALC_ENERGIES
-    /* flush the energies to shmem and reduce them */
-    f_buf[              tidx] = E_lj;
-    f_buf[FBUF_STRIDE + tidx] = E_el;
-    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
-#endif
-}
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh

index c8566a12ece804dd4ae5f8631012f507d9cb8357..55884cff84111e8daffb57b1b9eca63e139d7ee7 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@ -47,7 +47,6 @@
  /* Analytical plain cut-off kernels */
  #define EL_CUTOFF
  #define NB_KERNEL_FUNC_NAME(x,...) x##_cutoff##__VA_ARGS__
  /* Analytical plain cut-off kernels */
  #define EL_CUTOFF
  #define NB_KERNEL_FUNC_NAME(x,...) x##_cutoff##__VA_ARGS__
-#include "nbnxn_cuda_kernel_legacy.cuh"
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_CUTOFF
  #undef NB_KERNEL_FUNC_NAME
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_CUTOFF
  #undef NB_KERNEL_FUNC_NAME
@@ -55,13 +54,11 @@
  /* Analytical reaction-field kernels */
  #define EL_RF
  #define NB_KERNEL_FUNC_NAME(x,...) x##_rf##__VA_ARGS__
  /* Analytical reaction-field kernels */
  #define EL_RF
  #define NB_KERNEL_FUNC_NAME(x,...) x##_rf##__VA_ARGS__
-#include "nbnxn_cuda_kernel_legacy.cuh"
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_RF
  #undef NB_KERNEL_FUNC_NAME
  
  /* Analytical Ewald interaction kernels
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_RF
  #undef NB_KERNEL_FUNC_NAME
  
  /* Analytical Ewald interaction kernels
- * NOTE: no legacy kernels with analytical Ewald.
   */
  #define EL_EWALD_ANA
  #define NB_KERNEL_FUNC_NAME(x,...) x##_ewald##__VA_ARGS__
   */
  #define EL_EWALD_ANA
  #define NB_KERNEL_FUNC_NAME(x,...) x##_ewald##__VA_ARGS__
@@ -70,7 +67,6 @@
  #undef NB_KERNEL_FUNC_NAME
  
  /* Analytical Ewald interaction kernels with twin-range cut-off
  #undef NB_KERNEL_FUNC_NAME
  
  /* Analytical Ewald interaction kernels with twin-range cut-off
- * NOTE: no legacy kernels with analytical Ewald.
   */
  #define EL_EWALD_ANA
  #define VDW_CUTOFF_CHECK
   */
  #define EL_EWALD_ANA
  #define VDW_CUTOFF_CHECK
@@ -83,7 +79,6 @@
  /* Tabulated Ewald interaction kernels */
  #define EL_EWALD_TAB
  #define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab##__VA_ARGS__
  /* Tabulated Ewald interaction kernels */
  #define EL_EWALD_TAB
  #define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab##__VA_ARGS__
-#include "nbnxn_cuda_kernel_legacy.cuh"
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_EWALD_TAB
  #undef NB_KERNEL_FUNC_NAME
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_EWALD_TAB
  #undef NB_KERNEL_FUNC_NAME
@@ -92,7 +87,6 @@
  #define EL_EWALD_TAB
  #define VDW_CUTOFF_CHECK
  #define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab_twin##__VA_ARGS__
  #define EL_EWALD_TAB
  #define VDW_CUTOFF_CHECK
  #define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab_twin##__VA_ARGS__
-#include "nbnxn_cuda_kernel_legacy.cuh"
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_EWALD_TAB
  #undef VDW_CUTOFF_CHECK
  #include "nbnxn_cuda_kernel.cuh"
  #undef EL_EWALD_TAB
  #undef VDW_CUTOFF_CHECK
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h

index 923fb60a67f65fb1de5e1999bc6e110648c2a8c4..ac9bea9dd2a9f7b14892dd2000b2540a0782ec42 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -63,24 +63,12 @@ extern "C" {
   *  in the CPU kernels, the tabulated kernels are ATM Ewald-only.
   *
   *  The order of pointers to different electrostatic kernels defined in
   *  in the CPU kernels, the tabulated kernels are ATM Ewald-only.
   *
   *  The order of pointers to different electrostatic kernels defined in
- *  nbnxn_cuda.cu by the nb_default_kfunc_ptr and nb_legacy_kfunc_ptr arrays
+ *  nbnxn_cuda.cu by the nb_default_kfunc_ptr array
   *  should match the order of enumerated types below. */
  enum {
      eelCuCUT, eelCuRF, eelCuEWALD_TAB, eelCuEWALD_TAB_TWIN, eelCuEWALD_ANA, eelCuEWALD_ANA_TWIN, eelCuNR
  };
  
   *  should match the order of enumerated types below. */
  enum {
      eelCuCUT, eelCuRF, eelCuEWALD_TAB, eelCuEWALD_TAB_TWIN, eelCuEWALD_ANA, eelCuEWALD_ANA_TWIN, eelCuNR
  };
  
-/** Kernel flavors with different set of optimizations: default for CUDA <=v4.1
- *  compilers and legacy for earlier, 3.2 and 4.0 CUDA compilers. */
-enum {
-    eNbnxnCuKDefault, eNbnxnCuKLegacy, eNbnxnCuKNR
-};
-
-#define NBNXN_KVER_OLD(k)      (k == eNbnxnCuKOld)
-#define NBNXN_KVER_LEGACY(k)   (k == eNbnxnCuKLegacy)
-#define NBNXN_KVER_DEFAULT(k)  (k == eNbnxnCuKDefault)
-
-/* Non-bonded kernel versions. */
-
  /* All structs prefixed with "cu_" hold data used in GPU calculations and
   * are passed to the kernels, except cu_timers_t. */
  typedef struct cu_plist     cu_plist_t;
  /* All structs prefixed with "cu_" hold data used in GPU calculations and
   * are passed to the kernels, except cu_timers_t. */
  typedef struct cu_plist     cu_plist_t;
@@ -191,8 +179,6 @@ struct cu_timers
  struct nbnxn_cuda
  {
      cuda_dev_info_t *dev_info;       /**< CUDA device information                              */
  struct nbnxn_cuda
  {
      cuda_dev_info_t *dev_info;       /**< CUDA device information                              */
-    int              kernel_ver;     /**< The version of the kernel to be executed on the
-                                          device in use, possible values: eNbnxnCuK*           */
      bool             bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU    */
      bool             bUseStreamSync; /**< true if the standard cudaStreamSynchronize is used
                                            and not memory polling-based waiting                 */
      bool             bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU    */
      bool             bUseStreamSync; /**< true if the standard cudaStreamSynchronize is used
                                            and not memory polling-based waiting                 */
author	Szilard Pall <pall.szilard@gmail.com>
	Sun, 1 Dec 2013 21:53:42 +0000 (22:53 +0100)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Mon, 2 Dec 2013 23:19:26 +0000 (00:19 +0100)
manual/install.tex		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh	[deleted file]	patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h		patch \| blob \| history