From 354ebb3e51390bc488869630c583cc590dd61eeb Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Sun, 14 Oct 2018 11:16:39 +0200 Subject: [PATCH] Removed support for NVIDIA CC 2.x devices (codename Fermi) These are no longer tested or supported, but it is possible that the OpenCL version of GROMACS will still run on such old devices. Various code for configuration, the use of texture objects, the use of shared memory, and the kernel dispatch is now simpler. Fixes #2408 Fixes #2410 Fixes #2665 Change-Id: Ia7a00e5d6a97f93cd2768beb7ad56b2cce628a6f --- CMakeLists.txt | 2 +- cmake/gmxManageClangCudaConfig.cmake | 3 - cmake/gmxManageGPU.cmake | 2 +- cmake/gmxManageNvccConfig.cmake | 15 +- docs/install-guide/index.rst | 3 +- docs/release-notes/removed-functionality.rst | 6 + docs/user-guide/environment-variables.rst | 2 +- src/gromacs/ewald/pme-gather.cu | 4 +- src/gromacs/ewald/pme-gpu-constants.h | 4 +- src/gromacs/ewald/pme-gpu-internal.cpp | 12 +- src/gromacs/ewald/pme-solve.cu | 58 -- src/gromacs/ewald/pme-spread.cu | 2 +- src/gromacs/gpu_utils/cuda_arch_utils.cuh | 9 +- src/gromacs/gpu_utils/cudautils.cu | 36 +- src/gromacs/gpu_utils/cudautils.cuh | 8 +- src/gromacs/gpu_utils/gpu_utils.cu | 15 +- src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu | 48 +- src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h | 4 +- .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 58 +- .../mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 9 +- .../nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh | 588 ------------------ .../nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh | 12 +- .../mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh | 91 ++- src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp | 9 +- 24 files changed, 128 insertions(+), 872 deletions(-) delete mode 100644 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh diff --git a/CMakeLists.txt b/CMakeLists.txt index 51272c4401..fbe554aa65 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -220,7 +220,7 @@ if(MSVC) else() set(REQUIRED_CUDA_VERSION 7.0) endif() -set(REQUIRED_CUDA_COMPUTE_CAPABILITY 2.0) +set(REQUIRED_CUDA_COMPUTE_CAPABILITY 3.0) # OpenCL required version: 1.2 or newer set(REQUIRED_OPENCL_MIN_VERSION 1.2) diff --git a/cmake/gmxManageClangCudaConfig.cmake b/cmake/gmxManageClangCudaConfig.cmake index 510e819088..542674d56c 100644 --- a/cmake/gmxManageClangCudaConfig.cmake +++ b/cmake/gmxManageClangCudaConfig.cmake @@ -64,9 +64,6 @@ if (GMX_CUDA_TARGET_SM) list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_${_target}") endforeach() else() - if(CUDA_VERSION VERSION_LESS "9.00") # < 9.0 - list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_20") - endif() list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_30") list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_35") # clang 6.0 + CUDA 9.0 seems to have issues generating code for sm_37 diff --git a/cmake/gmxManageGPU.cmake b/cmake/gmxManageGPU.cmake index eaec53a18e..6c674e5f5c 100644 --- a/cmake/gmxManageGPU.cmake +++ b/cmake/gmxManageGPU.cmake @@ -107,7 +107,7 @@ Compute capability information not available, consult the NVIDIA website: https://developer.nvidia.com/cuda-gpus") endif() - set(CUDA_NOTFOUND_MESSAGE "mdrun supports native GPU acceleration on NVIDIA hardware with compute capability >= ${REQUIRED_CUDA_COMPUTE_CAPABILITY} (Fermi or later). This requires the NVIDIA CUDA toolkit, which was not found. Its location can be hinted by setting the CUDA_TOOLKIT_ROOT_DIR CMake option (does not work as an environment variable). The typical location would be /usr/local/cuda[-version]. 
Note that CPU or GPU acceleration can be selected at runtime.
+    set(CUDA_NOTFOUND_MESSAGE "mdrun supports native GPU acceleration on NVIDIA hardware with compute capability >= ${REQUIRED_CUDA_COMPUTE_CAPABILITY} (Kepler or later). This requires the NVIDIA CUDA toolkit, which was not found. Its location can be hinted by setting the CUDA_TOOLKIT_ROOT_DIR CMake option (does not work as an environment variable). The typical location would be /usr/local/cuda[-version]. Note that CPU or GPU acceleration can be selected at runtime.
 ${_msg}")
     unset(_msg)
diff --git a/cmake/gmxManageNvccConfig.cmake b/cmake/gmxManageNvccConfig.cmake
index e82743df0c..5704baaff3 100644
--- a/cmake/gmxManageNvccConfig.cmake
+++ b/cmake/gmxManageNvccConfig.cmake
@@ -98,13 +98,13 @@ if (GMX_CUDA_TARGET_SM OR GMX_CUDA_TARGET_COMPUTE)
 else()
     # Set the CUDA GPU architectures to compile for:
     #     - with CUDA >=5.0 <6.5:   CC <=3.5 is supported
-    #       => compile sm_20, sm_30, sm_35 SASS, and compute_35 PTX
+    #       => compile sm_30, sm_35 SASS, and compute_35 PTX
     #     - with CUDA ==6.5:        CC <=3.7 and 5.0 are supported
-    #       => compile sm_20, sm_30, sm_35, sm_37 sm_50, SASS, and compute_50 PTX
+    #       => compile sm_30, sm_35, sm_37, sm_50 SASS, and compute_50 PTX
     #     - with CUDA >=7.0         CC 5.2 is supported (5.3, Tegra X1 we don't generate code for)
-    #       => compile sm_20, sm_30, sm_35, sm_37, sm_50, & sm_52 SASS, and compute_52 PTX
+    #       => compile sm_30, sm_35, sm_37, sm_50, & sm_52 SASS, and compute_52 PTX
     #     - with CUDA >=8.0         CC 6.0-6.2 is supported (but we know nothing about CC 6.2, so we won't generate code for it)
-    #       => compile sm_20, sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61 SASS, and compute_60 and compute_61 PTX
+    #       => compile sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61 SASS, and compute_60 and compute_61 PTX
     #     - with CUDA >=9.0         CC 7.0 is supported and CC 2.0 is no longer supported
     #       => compile sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61, sm_70 SASS, and compute_70 PTX
     #
@@ -113,9 +113,6 @@ else()
     # equally fast as compiling with sm_52 anyway.
     # First add flags that trigger SASS (binary) code generation for physical arch
-    if(CUDA_VERSION VERSION_LESS "9.00") # < 9.0
-        list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_20,code=sm_20")
-    endif()
     list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_30,code=sm_30")
     list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_35,code=sm_35")
@@ -162,10 +159,6 @@ endif()
 # assemble the CUDA flags
 list(APPEND GMX_CUDA_NVCC_FLAGS "${GMX_CUDA_NVCC_GENCODE_FLAGS}")
 list(APPEND GMX_CUDA_NVCC_FLAGS "-use_fast_math")
-if (CUDA_VERSION VERSION_EQUAL "8.0")
-    # requesting sm_20 triggers deprecation messages with nvcc 8.0 which we better avoid
-    list(APPEND GMX_CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
-endif()
 # assemble the CUDA host compiler flags
 list(APPEND GMX_CUDA_NVCC_FLAGS "${CUDA_HOST_COMPILER_OPTIONS}")
diff --git a/docs/install-guide/index.rst b/docs/install-guide/index.rst
index 04384c86f8..777aa1a251 100644
--- a/docs/install-guide/index.rst
+++ b/docs/install-guide/index.rst
@@ -203,8 +203,7 @@ least NVIDIA compute capability |REQUIRED_CUDA_COMPUTE_CAPABILITY| are
 required. You are strongly recommended to get the latest CUDA
 version and driver that supports your hardware, but beware of
 possible performance regressions in newer CUDA versions on
-older hardware. Note that compute capability 2.0 (Fermi)
-devices are no longer supported from CUDA 9.0 and later.
+older hardware.
While some CUDA compilers (nvcc) might not officially support
 recent versions of gcc as the back-end compiler, we still
 recommend that you at least use a gcc version recent enough to
diff --git a/docs/release-notes/removed-functionality.rst b/docs/release-notes/removed-functionality.rst
index d45130e03b..3d587e001b 100644
--- a/docs/release-notes/removed-functionality.rst
+++ b/docs/release-notes/removed-functionality.rst
@@ -10,6 +10,12 @@ useful when root permissions were available to the user. It may become less
 useful as GROMACS evolves, complicated the GROMACS code, and wasn't
 regularly tested or maintained. It might return if some of these conditions change.
+Support for CUDA compute capability 2.x removed
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+The Fermi-era GPUs (circa 2010) are no longer in widespread use, are
+not tested in Jenkins, and complicated the code, so they are no longer
+supported.
+
 Contrib directory removed
 """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
 This code had not been maintained in years, so likely didn't work, and
diff --git a/docs/user-guide/environment-variables.rst b/docs/user-guide/environment-variables.rst
index f56508b787..61ef25443e 100644
--- a/docs/user-guide/environment-variables.rst
+++ b/docs/user-guide/environment-variables.rst
@@ -254,7 +254,7 @@ Performance and Run Control
         target minimum number of pair-lists in order to improve multi-processor
         load-balance for better performance with small simulation systems.
         Must be set to a non-negative integer, the 0 value disables list splitting.
-        The default value is optimized for supported GPUs (NVIDIA Fermi to Maxwell),
+        The default value is optimized for supported GPUs,
         therefore changing it is not necessary for normal usage, but it can be
         useful on future architectures.

 ``GMX_NBLISTCG``
diff --git a/src/gromacs/ewald/pme-gather.cu b/src/gromacs/ewald/pme-gather.cu
index bb3577322d..dfcce64f72 100644
--- a/src/gromacs/ewald/pme-gather.cu
+++ b/src/gromacs/ewald/pme-gather.cu
@@ -90,7 +90,6 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
                                                    float &fy,
                                                    float &fz)
 {
-#if (GMX_PTX_ARCH >= 300)
     if (!(order & (order - 1))) // Only for orders of power of 2
     {
         const unsigned int activeMask = c_fullWarpMask;
@@ -136,7 +135,6 @@ __device__ __forceinline__ void reduce_atom_forces(float3 * __restrict__ sm_forc
         }
     }
     else
-#endif
     {
         // We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
         // which are stored separately (first 2 dimensions only)
@@ -248,7 +246,7 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

     /* Early return for fully empty blocks at the end
-     * (should only happen on Fermi or billions of input atoms)
+     * (should only happen for billions of input atoms)
      */
     if (atomIndexOffset >= kernelParams.atoms.nAtoms)
     {
diff --git a/src/gromacs/ewald/pme-gpu-constants.h b/src/gromacs/ewald/pme-gpu-constants.h
index 57e24756e5..50accc397e 100644
--- a/src/gromacs/ewald/pme-gpu-constants.h
+++ b/src/gromacs/ewald/pme-gpu-constants.h
@@ -173,8 +173,8 @@ constexpr int c_solveMaxThreadsPerBlock = (c_solveMaxWarpsPerBlock * warp_size);
 //! Gathering max block size in threads
 constexpr int c_gatherMaxThreadsPerBlock = c_gatherMaxWarpsPerBlock * warp_size;
-//! 
Gathering min blocks per CUDA multiprocessor - for CC2.x, we just take the CUDA limit of 8 to avoid the warning -constexpr int c_gatherMinBlocksPerMP = (GMX_PTX_ARCH < 300) ? GMX_CUDA_MAX_BLOCKS_PER_MP : (GMX_CUDA_MAX_THREADS_PER_MP / c_gatherMaxThreadsPerBlock); +//! Gathering min blocks per CUDA multiprocessor +constexpr int c_gatherMinBlocksPerMP = GMX_CUDA_MAX_THREADS_PER_MP / c_gatherMaxThreadsPerBlock; #endif // GMX_GPU == GMX_GPU_CUDA diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp index 15630727e8..d90ee204ad 100644 --- a/src/gromacs/ewald/pme-gpu-internal.cpp +++ b/src/gromacs/ewald/pme-gpu-internal.cpp @@ -387,14 +387,12 @@ void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *pmeGpu) initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable, kernelParamsPtr->fractShiftsTableTexture, pmeGpu->common->fsh.data(), - newFractShiftsSize, - pmeGpu->deviceInfo); + newFractShiftsSize); initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable, kernelParamsPtr->gridlineIndicesTableTexture, pmeGpu->common->nn.data(), - newFractShiftsSize, - pmeGpu->deviceInfo); + newFractShiftsSize); #elif GMX_GPU == GMX_GPU_OPENCL // No dedicated texture routines.... allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize, pmeGpu->archSpecific->context); @@ -413,11 +411,9 @@ void pme_gpu_free_fract_shifts(const PmeGpu *pmeGpu) auto *kernelParamsPtr = pmeGpu->kernelParams.get(); #if GMX_GPU == GMX_GPU_CUDA destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable, - kernelParamsPtr->fractShiftsTableTexture, - pmeGpu->deviceInfo); + kernelParamsPtr->fractShiftsTableTexture); destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable, - kernelParamsPtr->gridlineIndicesTableTexture, - pmeGpu->deviceInfo); + kernelParamsPtr->gridlineIndicesTableTexture); #elif GMX_GPU == GMX_GPU_OPENCL freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable); freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable); diff --git a/src/gromacs/ewald/pme-solve.cu b/src/gromacs/ewald/pme-solve.cu index b163ddb3a8..bac9c9c6b6 100644 --- a/src/gromacs/ewald/pme-solve.cu +++ b/src/gromacs/ewald/pme-solve.cu @@ -240,7 +240,6 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam /* Optional energy/virial reduction */ if (computeEnergyAndVirial) { -#if (GMX_PTX_ARCH >= 300) /* A tricky shuffle reduction inspired by reduce_force_j_warp_shfl. * The idea is to reduce 7 energy/virial components into a single variable (aligned by 8). * We will reduce everything into virxx. @@ -337,63 +336,6 @@ __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParam atomicAdd(gm_virialAndEnergy + componentIndex, output); } } -#else - /* Shared memory reduction with atomics for compute capability < 3.0. - * Each component is first reduced into warp_size positions in the shared memory; - * Then first c_virialAndEnergyCount warps reduce everything further and add to the global memory. - * This can likely be improved, but is anyway faster than the previous straightforward reduction, - * which was using too much shared memory (for storing all 7 floats on each thread). - * [48KB (shared mem limit per SM on CC2.x) / sizeof(float) (4) / c_solveMaxThreadsPerBlock (256) / c_virialAndEnergyCount (7) == - * 6 blocks per SM instead of 16 which is maximum on CC2.x]. 
- */ - - const int lane = threadLocalId & (warp_size - 1); - const int warpIndex = threadLocalId / warp_size; - const bool firstWarp = (warpIndex == 0); - __shared__ float sm_virialAndEnergy[c_virialAndEnergyCount * warp_size]; - if (firstWarp) - { - sm_virialAndEnergy[0 * warp_size + lane] = virxx; - sm_virialAndEnergy[1 * warp_size + lane] = viryy; - sm_virialAndEnergy[2 * warp_size + lane] = virzz; - sm_virialAndEnergy[3 * warp_size + lane] = virxy; - sm_virialAndEnergy[4 * warp_size + lane] = virxz; - sm_virialAndEnergy[5 * warp_size + lane] = viryz; - sm_virialAndEnergy[6 * warp_size + lane] = energy; - } - __syncthreads(); - if (!firstWarp) - { - atomicAdd(sm_virialAndEnergy + 0 * warp_size + lane, virxx); - atomicAdd(sm_virialAndEnergy + 1 * warp_size + lane, viryy); - atomicAdd(sm_virialAndEnergy + 2 * warp_size + lane, virzz); - atomicAdd(sm_virialAndEnergy + 3 * warp_size + lane, virxy); - atomicAdd(sm_virialAndEnergy + 4 * warp_size + lane, virxz); - atomicAdd(sm_virialAndEnergy + 5 * warp_size + lane, viryz); - atomicAdd(sm_virialAndEnergy + 6 * warp_size + lane, energy); - } - __syncthreads(); - - GMX_UNUSED_VALUE(activeWarps); - assert(activeWarps >= c_virialAndEnergyCount); // we need to cover all components, or have multiple iterations otherwise - const int componentIndex = warpIndex; - if (componentIndex < c_virialAndEnergyCount) - { - const int targetIndex = threadLocalId; -#pragma unroll - for (int reductionStride = warp_size >> 1; reductionStride >= 1; reductionStride >>= 1) - { - if (lane < reductionStride) - { - sm_virialAndEnergy[targetIndex] += sm_virialAndEnergy[targetIndex + reductionStride]; - } - } - if (lane == 0) - { - atomicAdd(gm_virialAndEnergy + componentIndex, sm_virialAndEnergy[targetIndex]); - } - } -#endif } } diff --git a/src/gromacs/ewald/pme-spread.cu b/src/gromacs/ewald/pme-spread.cu index 2275dba5af..e45945551b 100644 --- a/src/gromacs/ewald/pme-spread.cu +++ b/src/gromacs/ewald/pme-spread.cu @@ -429,7 +429,7 @@ __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernel const int atomIndexOffset = blockIndex * atomsPerBlock; /* Early return for fully empty blocks at the end - * (should only happen on Fermi or billions of input atoms) + * (should only happen for billions of input atoms) */ if (atomIndexOffset >= kernelParams.atoms.nAtoms) { diff --git a/src/gromacs/gpu_utils/cuda_arch_utils.cuh b/src/gromacs/gpu_utils/cuda_arch_utils.cuh index 8ae0a20a08..79fa93353f 100644 --- a/src/gromacs/gpu_utils/cuda_arch_utils.cuh +++ b/src/gromacs/gpu_utils/cuda_arch_utils.cuh @@ -140,14 +140,14 @@ T gmx_shfl_down_sync(const unsigned int activeMask, /*! \brief Allow disabling CUDA textures using the GMX_DISABLE_CUDA_TEXTURES macro. * - * Only texture objects supported, disable textures for <= CC 2.0 (but not in host code). + * Only texture objects supported. * Disable texture support missing in clang (all versions up to <=5.0-dev as of writing). * * This option will not influence functionality. All features using textures ought * to have fallback for texture-less reads (direct/LDG loads), all new code needs * to provide fallback code. 
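 *
 * A minimal sketch of the expected fallback pattern (fetchLookup is a
 * hypothetical helper, not part of this change; LDG is the load wrapper
 * from cuda_kernel_utils.cuh):
 *
 * \code
 * template<typename T>
 * __device__ __forceinline__ T fetchLookup(const T *d_ptr, cudaTextureObject_t texObj, int index)
 * {
 * #if DISABLE_CUDA_TEXTURES
 *     return LDG(d_ptr + index);           // direct/LDG load
 * #else
 *     return tex1Dfetch<T>(texObj, index); // texture-object read
 * #endif
 * }
 * \endcode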
*/ -#if defined(GMX_DISABLE_CUDA_TEXTURES) || (GMX_PTX_ARCH > 0 && GMX_PTX_ARCH < 300) || (defined(__clang__) && defined(__CUDA__)) +#if defined(GMX_DISABLE_CUDA_TEXTURES) || (defined(__clang__) && defined(__CUDA__)) #define DISABLE_CUDA_TEXTURES 1 #else #define DISABLE_CUDA_TEXTURES 0 @@ -163,10 +163,7 @@ static const bool c_disableCudaTextures = DISABLE_CUDA_TEXTURES; * */ #if GMX_PTX_ARCH > 0 - #if GMX_PTX_ARCH <= 210 // CC 2.x - #define GMX_CUDA_MAX_BLOCKS_PER_MP 8 - #define GMX_CUDA_MAX_THREADS_PER_MP 1536 - #elif GMX_PTX_ARCH <= 370 // CC 3.x + #if GMX_PTX_ARCH <= 370 // CC 3.x #define GMX_CUDA_MAX_BLOCKS_PER_MP 16 #define GMX_CUDA_MAX_THREADS_PER_MP 2048 #else // CC 5.x, 6.x diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu index 6731fef53d..fda0e9bb90 100644 --- a/src/gromacs/gpu_utils/cudautils.cu +++ b/src/gromacs/gpu_utils/cudautils.cu @@ -134,18 +134,6 @@ int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s); } -/*! \brief Return whether texture objects are used on this device. - * - * \param[in] pointer to the GPU device info structure to inspect for texture objects support - * \return true if texture objects are used on this device - */ -static inline bool use_texobj(const gmx_device_info_t *dev_info) -{ - assert(!c_disableCudaTextures); - /* Only device CC >= 3.0 (Kepler and later) support texture objects */ - return (dev_info->prop.major >= 3); -} - /*! \brief Set up texture object for an array of type T. * * Set up texture object for an array of type T and bind it to the device memory @@ -183,8 +171,7 @@ template void initParamLookupTable(T * &d_ptr, cudaTextureObject_t &texObj, const T *h_ptr, - int numElem, - const gmx_device_info_t *devInfo) + int numElem) { const size_t sizeInBytes = numElem * sizeof(*d_ptr); cudaError_t stat = cudaMalloc((void **)&d_ptr, sizeInBytes); @@ -193,24 +180,17 @@ void initParamLookupTable(T * &d_ptr, if (!c_disableCudaTextures) { - if (use_texobj(devInfo)) - { - setup1DTexture(texObj, d_ptr, sizeInBytes); - } + setup1DTexture(texObj, d_ptr, sizeInBytes); } } template void destroyParamLookupTable(T *d_ptr, - cudaTextureObject_t texObj, - const gmx_device_info_t *devInfo) + cudaTextureObject_t texObj) { if (!c_disableCudaTextures) { - if (use_texobj(devInfo)) - { - CU_RET_ERR(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject on texObj failed"); - } + CU_RET_ERR(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject on texObj failed"); } CU_RET_ERR(cudaFree(d_ptr), "cudaFree failed"); } @@ -219,7 +199,7 @@ void destroyParamLookupTable(T *d_ptr, * One should also verify that the result of cudaCreateChannelDesc() during texture setup * looks reasonable, when instantiating the templates for new types - just in case. 
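 *
 * A minimal usage sketch with the simplified signatures (h_table and numElem
 * are hypothetical placeholders for the host-side data being uploaded):
 *
 * \code
 * float              *d_table = nullptr;
 * cudaTextureObject_t texObj;
 * // allocate d_table, copy numElem elements from h_table and, unless
 * // textures are disabled, bind a 1D texture object to the device buffer
 * initParamLookupTable(d_table, texObj, h_table, numElem);
 * // ... kernels read through texObj, or directly from d_table ...
 * destroyParamLookupTable(d_table, texObj); // destroys texObj, frees d_table
 * \endcode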
*/ -template void initParamLookupTable(float * &, cudaTextureObject_t &, const float *, int, const gmx_device_info_t *); -template void destroyParamLookupTable(float *, cudaTextureObject_t, const gmx_device_info_t *); -template void initParamLookupTable(int * &, cudaTextureObject_t &, const int *, int, const gmx_device_info_t *); -template void destroyParamLookupTable(int *, cudaTextureObject_t, const gmx_device_info_t *); +template void initParamLookupTable(float * &, cudaTextureObject_t &, const float *, int); +template void destroyParamLookupTable(float *, cudaTextureObject_t); +template void initParamLookupTable(int * &, cudaTextureObject_t &, const int *, int); +template void destroyParamLookupTable(int *, cudaTextureObject_t); diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh index 2fb14ba46a..df319ae051 100644 --- a/src/gromacs/gpu_utils/cudautils.cuh +++ b/src/gromacs/gpu_utils/cudautils.cuh @@ -179,14 +179,12 @@ int cu_copy_H2D_async(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/, cud * \param[out] texObj texture object to be initialized * \param[in] h_ptr pointer to the host memory to be uploaded to the device * \param[in] numElem number of elements in the h_ptr - * \param[in] devInfo pointer to the info struct of the device in use */ template void initParamLookupTable(T * &d_ptr, cudaTextureObject_t &texObj, const T *h_ptr, - int numElem, - const gmx_device_info_t *devInfo); + int numElem); // Add extern declarations so each translation unit understands that // there will be a definition provided. @@ -200,12 +198,10 @@ extern template void initParamLookupTable(float * &, cudaTextureObject_t * \tparam[in] T Raw data type * \param[in] d_ptr Device pointer to the memory to be deallocated * \param[in] texObj Texture object to be deinitialized - * \param[in] devInfo Pointer to the info struct of the device in use */ template void destroyParamLookupTable(T *d_ptr, - cudaTextureObject_t texObj, - const gmx_device_info_t *devInfo); + cudaTextureObject_t texObj); // Add extern declarations so each translation unit understands that // there will be a definition provided. diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index 04afee54db..307cdcd99d 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -87,7 +87,8 @@ static void checkCompiledTargetCompatibility(const gmx_device_info_t *devInfo) gmx_fatal(FARGS, "The %s binary does not include support for the CUDA architecture " "of the selected GPU (device ID #%d, compute capability %d.%d). " - "By default, GROMACS supports all common architectures, so your GPU " + "By default, GROMACS supports all architectures of compute " + "capability >= 3.0, so your GPU " "might be rare, or some architectures were disabled in the build. " "Consult the install guide for how to use the GMX_CUDA_TARGET_SM and " "GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture.", @@ -96,16 +97,6 @@ static void checkCompiledTargetCompatibility(const gmx_device_info_t *devInfo) } CU_RET_ERR(stat, "cudaFuncGetAttributes failed"); - - if (devInfo->prop.major >= 3 && attributes.ptxVersion < 30) - { - gmx_fatal(FARGS, - "The GPU device code was compiled at runtime from 2.0 source which is " - "not compatible with the selected GPU (device ID #%d, compute capability %d.%d). 
" - "Pass the appropriate target in GMX_CUDA_TARGET_SM or a >=30 value to GMX_CUDA_TARGET_COMPUTE.", - devInfo->id, - devInfo->prop.major, devInfo->prop.minor); - } } bool isHostMemoryPinned(const void *h_ptr) @@ -311,7 +302,7 @@ gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info, */ static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop) { - return (dev_prop->major >= 2); + return (dev_prop->major >= 3); } /*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration. diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 7fc1f6f1ac..12d0deb9b2 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -252,25 +252,19 @@ static inline int calc_shmem_required_nonbonded(const int num_threads_z, const g shmem = c_numClPerSupercl * c_clSize * sizeof(float4); /* cj in shared memory, for each warp separately */ shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int); - if (dinfo->prop.major >= 3) + + if (nbp->vdwtype == evdwCuCUTCOMBGEOM || + nbp->vdwtype == evdwCuCUTCOMBLB) { - if (nbp->vdwtype == evdwCuCUTCOMBGEOM || - nbp->vdwtype == evdwCuCUTCOMBLB) - { - /* i-atom LJ combination parameters in shared memory */ - shmem += c_numClPerSupercl * c_clSize * sizeof(float2); - } - else - { - /* i-atom types in shared memory */ - shmem += c_numClPerSupercl * c_clSize * sizeof(int); - } + /* i-atom LJ combination parameters in shared memory */ + shmem += c_numClPerSupercl * c_clSize * sizeof(float2); } - if (dinfo->prop.major < 3) + else { - /* force reduction buffers in shared memory */ - shmem += c_clSize * c_clSize * 3 * sizeof(float); + /* i-atom types in shared memory */ + shmem += c_numClPerSupercl * c_clSize * sizeof(int); } + return shmem; } @@ -666,7 +660,7 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, } } -void nbnxn_cuda_set_cacheconfig(const gmx_device_info_t *devinfo) +void nbnxn_cuda_set_cacheconfig() { cudaError_t stat; @@ -674,23 +668,11 @@ void nbnxn_cuda_set_cacheconfig(const gmx_device_info_t *devinfo) { for (int j = 0; j < evdwCuNR; j++) { - if (devinfo->prop.major >= 3) - { - /* Default kernel on sm 3.x and later 32/32 kB Shared/L1 */ - cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferEqual); - cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferEqual); - cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferEqual); - stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferEqual); - } - else - { - /* On Fermi prefer L1 gives 2% higher performance */ - /* Default kernel on sm_2.x 16/48 kB Shared/L1 */ - cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1); - cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1); - cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1); - stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1); - } + /* Default kernel 32/32 kB Shared/L1 */ + cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferEqual); + cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferEqual); + cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferEqual); + stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferEqual); CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed"); } } diff --git 
a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h index c61fe210dc..e2badb3283 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h @@ -39,9 +39,7 @@ #ifndef GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H #define GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H -#include "nbnxn_cuda_types.h" - //! Set up the cache configuration for the non-bonded kernels. -void nbnxn_cuda_set_cacheconfig(const gmx_device_info_t *devinfo); +void nbnxn_cuda_set_cacheconfig(); #endif diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 58e6d343e0..12187ce6fd 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -66,7 +66,7 @@ #include "nbnxn_cuda.h" #include "nbnxn_cuda_types.h" -/* This is a heuristically determined parameter for the Fermi, Kepler +/* This is a heuristically determined parameter for the Kepler * and Maxwell architectures for the minimum size of ci lists by multiplying * this constant with the # of multiprocessors on the current device. * Since the maximum number of blocks per multiprocessor is 16, the ideal @@ -80,8 +80,7 @@ static unsigned int gpu_min_ci_balanced_factor = 44; static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb); /* Fw. decl, */ -static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam, - const gmx_device_info_t *dev_info); +static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam); /*! \brief Return whether combination rules are used. * @@ -101,17 +100,16 @@ static inline bool useLjCombRule(const cu_nbparam_t *nbparam) it just re-uploads the table. */ static void init_ewald_coulomb_force_table(const interaction_const_t *ic, - cu_nbparam_t *nbp, - const gmx_device_info_t *dev_info) + cu_nbparam_t *nbp) { if (nbp->coulomb_tab != nullptr) { - nbnxn_cuda_free_nbparam_table(nbp, dev_info); + nbnxn_cuda_free_nbparam_table(nbp); } nbp->coulomb_tab_scale = ic->tabq_scale; initParamLookupTable(nbp->coulomb_tab, nbp->coulomb_tab_texobj, - ic->tabq_coul_F, ic->tabq_size, dev_info); + ic->tabq_coul_F, ic->tabq_size); } @@ -146,8 +144,7 @@ static void init_atomdata_first(cu_atomdata_t *ad, int ntypes) /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on earlier GPUs, single or twin cut-off. */ -static int pick_ewald_kernel_type(bool bTwinCut, - const gmx_device_info_t *dev_info) +static int pick_ewald_kernel_type(bool bTwinCut) { bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald; int kernel_type; @@ -163,17 +160,16 @@ static int pick_ewald_kernel_type(bool bTwinCut, "requested through environment variables."); } - /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */ - if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald) + /* By default use analytical Ewald. 
*/ + bUseAnalyticalEwald = true; + if (bForceAnalyticalEwald) { - bUseAnalyticalEwald = true; - if (debug) { fprintf(debug, "Using analytical Ewald CUDA kernels\n"); } } - else + else if (bForceTabulatedEwald) { bUseAnalyticalEwald = false; @@ -226,8 +222,7 @@ static void set_cutoff_parameters(cu_nbparam_t *nbp, static void init_nbparam(cu_nbparam_t *nbp, const interaction_const_t *ic, const NbnxnListParameters *listParams, - const nbnxn_atomdata_t *nbat, - const gmx_device_info_t *dev_info) + const nbnxn_atomdata_t *nbat) { int ntypes; @@ -304,7 +299,7 @@ static void init_nbparam(cu_nbparam_t *nbp, else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD)) { /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */ - nbp->eeltype = pick_ewald_kernel_type(false, dev_info); + nbp->eeltype = pick_ewald_kernel_type(false); } else { @@ -316,21 +311,21 @@ static void init_nbparam(cu_nbparam_t *nbp, nbp->coulomb_tab = nullptr; if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN) { - init_ewald_coulomb_force_table(ic, nbp, dev_info); + init_ewald_coulomb_force_table(ic, nbp); } /* set up LJ parameter lookup table */ if (!useLjCombRule(nbp)) { initParamLookupTable(nbp->nbfp, nbp->nbfp_texobj, - nbat->nbfp, 2*ntypes*ntypes, dev_info); + nbat->nbfp, 2*ntypes*ntypes); } /* set up LJ-PME parameter lookup table */ if (ic->vdwtype == evdwPME) { initParamLookupTable(nbp->nbfp_comb, nbp->nbfp_comb_texobj, - nbat->nbfp_comb, 2*ntypes, dev_info); + nbat->nbfp_comb, 2*ntypes); } } @@ -349,10 +344,9 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, set_cutoff_parameters(nbp, ic, listParams); - nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw, - nb->dev_info); + nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw); - init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info); + init_ewald_coulomb_force_table(ic, nb->nbparam); } /*! Initializes the pair list data structure. 
*/ @@ -421,7 +415,7 @@ static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb, const nbnxn_atomdata_t *nbat) { init_atomdata_first(nb->atdat, nbat->ntype); - init_nbparam(nb->nbparam, ic, listParams, nbat, nb->dev_info); + init_nbparam(nb->nbparam, ic, listParams, nbat); /* clear energy and shift force outputs */ nbnxn_cuda_clear_e_fshift(nb); @@ -508,7 +502,7 @@ void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb, /* set the kernel type for the current GPU */ /* pick L1 cache configuration */ - nbnxn_cuda_set_cacheconfig(nb->dev_info); + nbnxn_cuda_set_cacheconfig(); nbnxn_cuda_init_const(nb, ic, listParams, nbat); @@ -713,13 +707,11 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, } } -static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam, - const gmx_device_info_t *dev_info) +static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam) { if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN) { - destroyParamLookupTable(nbparam->coulomb_tab, nbparam->coulomb_tab_texobj, - dev_info); + destroyParamLookupTable(nbparam->coulomb_tab, nbparam->coulomb_tab_texobj); } } @@ -737,7 +729,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) atdat = nb->atdat; nbparam = nb->nbparam; - nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info); + nbnxn_cuda_free_nbparam_table(nbparam); stat = cudaEventDestroy(nb->nonlocal_done); CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done"); @@ -757,15 +749,13 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) if (!useLjCombRule(nb->nbparam)) { - destroyParamLookupTable(nbparam->nbfp, nbparam->nbfp_texobj, - nb->dev_info); + destroyParamLookupTable(nbparam->nbfp, nbparam->nbfp_texobj); } if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB) { - destroyParamLookupTable(nbparam->nbfp_comb, nbparam->nbfp_comb_texobj, - nb->dev_info); + destroyParamLookupTable(nbparam->nbfp_comb, nbparam->nbfp_comb_texobj); } stat = cudaFree(atdat->shift_vec); diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index bf9cdf0d2e..a1c85b7f53 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -54,10 +54,6 @@ * code that is in double precision. */ -#if GMX_PTX_ARCH < 300 && GMX_PTX_ARCH != 0 -#error "nbnxn_cuda_kernel.cuh included with GMX_PTX_ARCH < 300 or host pass" -#endif - #if defined EL_EWALD_ANA || defined EL_EWALD_TAB /* Note: convenience macro, needs to be undef-ed at the end of the file. */ #define EL_EWALD_ANY @@ -97,7 +93,7 @@ * NTHREAD_Z controls the number of j-clusters processed concurrently on NTHREAD_Z * warp-pairs per block. * - * - On CC 2.0-3.5, and >=5.0 NTHREAD_Z == 1, translating to 64 th/block with 16 + * - On CC 3.0-3.5, and >=5.0 NTHREAD_Z == 1, translating to 64 th/block with 16 * blocks/multiproc, is the fastest even though this setup gives low occupancy * (except on 6.0). 
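 *
 *   As a hedged reading of those numbers (a sketch assuming the usual
 *   c_clSize == 8 used by these kernels):
 *     THREADS_PER_BLOCK = c_clSize * c_clSize * NTHREAD_Z = 8 * 8 * 1 = 64
 *   and the kernel is annotated with
 *     __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
 *   so that the compiler budgets registers for the stated occupancy.
 *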
* NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks @@ -142,9 +138,6 @@ #define THREADS_PER_BLOCK (c_clSize*c_clSize*NTHREAD_Z) #if GMX_PTX_ARCH >= 350 -#if (GMX_PTX_ARCH <= 210) && (NTHREAD_Z > 1) - #error NTHREAD_Z > 1 will give incorrect results on CC 2.x -#endif /**@}*/ __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) #else diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh deleted file mode 100644 index 4fa0c8833d..0000000000 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh +++ /dev/null @@ -1,588 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ - -/*! \internal \file - * \brief - * CUDA non-bonded kernel used through preprocessor-based code generation - * of multiple kernel flavors for CC 2.x, see nbnxn_cuda_kernels.cuh. - * - * NOTE: No include fence as it is meant to be included multiple times. - * - * \author Szilárd Páll - * \author Berk Hess - * \ingroup module_mdlib - */ - -#include "gromacs/gpu_utils/cuda_arch_utils.cuh" -#include "gromacs/gpu_utils/cuda_kernel_utils.cuh" -#include "gromacs/math/utilities.h" -#include "gromacs/pbcutil/ishift.h" -/* Note that floating-point constants in CUDA code should be suffixed - * with f (e.g. 0.5f), to stop the compiler producing intermediate - * code that is in double precision. - */ - -#if GMX_PTX_ARCH >= 300 -#error "nbnxn_cuda_kernel_fermi.cuh included with GMX_PTX_ARCH >= 300" -#endif - -#if defined EL_EWALD_ANA || defined EL_EWALD_TAB -/* Note: convenience macro, needs to be undef-ed at the end of the file. 
*/ -#define EL_EWALD_ANY -#endif - -#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD || (defined EL_CUTOFF && defined CALC_ENERGIES) -/* Macro to control the calculation of exclusion forces in the kernel - * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion - * energy terms. - * - * Note: convenience macro, needs to be undef-ed at the end of the file. - */ -#define EXCLUSION_FORCES -#endif - -#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB -/* Note: convenience macro, needs to be undef-ed at the end of the file. */ -#define LJ_EWALD -#endif - -#if defined LJ_COMB_GEOM || defined LJ_COMB_LB -#define LJ_COMB -#endif - -/* - Kernel launch parameters: - - #blocks = #pair lists, blockId = pair list Id - - #threads = c_clSize^2 - - shmem = see nbnxn_cuda.cu:calc_shmem_required_nonbonded() - - Each thread calculates an i force-component taking one pair of i-j atoms. - */ - -/**@{*/ -/*! \brief Definition of kernel launch configuration parameters for CC 2.x. - */ - -/* Kernel launch bounds, 16 blocks/multiprocessor can be kept in flight. */ -#define THREADS_PER_BLOCK (c_clSize*c_clSize) - -__launch_bounds__(THREADS_PER_BLOCK) -#ifdef PRUNE_NBL -#ifdef CALC_ENERGIES -__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_cuda) -#else -__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_cuda) -#endif /* CALC_ENERGIES */ -#else -#ifdef CALC_ENERGIES -__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_cuda) -#else -__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) -#endif /* CALC_ENERGIES */ -#endif /* PRUNE_NBL */ -(const cu_atomdata_t atdat, - const cu_nbparam_t nbparam, - const cu_plist_t plist, - bool bCalcFshift) -#ifdef FUNCTION_DECLARATION_ONLY -; /* Only do function declaration, omit the function body. 
*/ -#else -{ - /* convenience variables */ - const nbnxn_sci_t *pl_sci = plist.sci; -#ifndef PRUNE_NBL - const -#endif - nbnxn_cj4_t *pl_cj4 = plist.cj4; - const nbnxn_excl_t *excl = plist.excl; -#ifndef LJ_COMB - const int *atom_types = atdat.atom_types; - int ntypes = atdat.ntypes; -#else - const float2 *lj_comb = atdat.lj_comb; - float2 ljcp_i, ljcp_j; -#endif - const float4 *xq = atdat.xq; - float3 *f = atdat.f; - const float3 *shift_vec = atdat.shift_vec; - float rcoulomb_sq = nbparam.rcoulomb_sq; -#ifdef VDW_CUTOFF_CHECK - float rvdw_sq = nbparam.rvdw_sq; - float vdw_in_range; -#endif -#ifdef LJ_EWALD - float lje_coeff2, lje_coeff6_6; -#endif -#ifdef EL_RF - float two_k_rf = nbparam.two_k_rf; -#endif -#ifdef EL_EWALD_ANA - float beta2 = nbparam.ewald_beta*nbparam.ewald_beta; - float beta3 = nbparam.ewald_beta*nbparam.ewald_beta*nbparam.ewald_beta; -#endif -#ifdef PRUNE_NBL - float rlist_sq = nbparam.rlistOuter_sq; -#endif - -#ifdef CALC_ENERGIES -#ifdef EL_EWALD_ANY - float beta = nbparam.ewald_beta; - float ewald_shift = nbparam.sh_ewald; -#else - float c_rf = nbparam.c_rf; -#endif /* EL_EWALD_ANY */ - float *e_lj = atdat.e_lj; - float *e_el = atdat.e_el; -#endif /* CALC_ENERGIES */ - - /* thread/block/warp id-s */ - unsigned int tidxi = threadIdx.x; - unsigned int tidxj = threadIdx.y; - unsigned int tidx = threadIdx.y * blockDim.x + threadIdx.x; - unsigned int bidx = blockIdx.x; - unsigned int widx = tidx / warp_size; /* warp index */ - - int sci, ci, cj, - ai, aj, - cij4_start, cij4_end; -#ifndef LJ_COMB - int typei, typej; -#endif - int i, jm, j4, wexcl_idx; - float qi, qj_f, - r2, inv_r, inv_r2; -#if !defined LJ_COMB_LB || defined CALC_ENERGIES - float inv_r6, c6, c12; -#endif -#ifdef LJ_COMB_LB - float sigma, epsilon; -#endif - float int_bit, - F_invr; -#ifdef CALC_ENERGIES - float E_lj, E_el; -#endif -#if defined CALC_ENERGIES || defined LJ_POT_SWITCH - float E_lj_p; -#endif - unsigned int wexcl, imask, mask_ji; - float4 xqbuf; - float3 xi, xj, rv, f_ij, fcj_buf; - float3 fci_buf[c_numClPerSupercl]; /* i force buffer */ - nbnxn_sci_t nb_sci; - - /*! i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */ - const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U); - - /********************************************************************* - * Set up shared memory pointers. - * sm_nextSlotPtr should always be updated to point to the "next slot", - * that is past the last point where data has been stored. 
- */ - extern __shared__ char sm_dynamicShmem[]; - char *sm_nextSlotPtr = sm_dynamicShmem; - static_assert(sizeof(char) == 1, "The shared memory offset calculation assumes that char is 1 byte"); - - /* shmem buffer for i x+q pre-loading */ - float4 *xqib = (float4 *)sm_nextSlotPtr; - sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*xqib)); - - /* shmem buffer for cj, for each warp separately */ - int *cjs = (int *)(sm_nextSlotPtr); - sm_nextSlotPtr += (c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs)); - - /* shmem j force buffer */ - float *f_buf = (float *)(sm_nextSlotPtr); - sm_nextSlotPtr += (c_clSize * c_clSize * 3*sizeof(*f_buf)); - /*********************************************************************/ - - nb_sci = pl_sci[bidx]; /* my i super-cluster's index = current bidx */ - sci = nb_sci.sci; /* super-cluster */ - cij4_start = nb_sci.cj4_ind_start; /* first ...*/ - cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */ - - { - /* Pre-load i-atom x and q into shared memory */ - ci = sci * c_numClPerSupercl + tidxj; - ai = ci * c_clSize + tidxi; - - xqbuf = xq[ai] + shift_vec[nb_sci.shift]; - xqbuf.w *= nbparam.epsfac; - xqib[tidxj * c_clSize + tidxi] = xqbuf; - } - __syncthreads(); - - for (i = 0; i < c_numClPerSupercl; i++) - { - fci_buf[i] = make_float3(0.0f); - } - -#ifdef LJ_EWALD - /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */ - lje_coeff2 = nbparam.ewaldcoeff_lj*nbparam.ewaldcoeff_lj; - lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*c_oneSixth; -#endif - - -#ifdef CALC_ENERGIES - E_lj = 0.0f; - E_el = 0.0f; - -#ifdef EXCLUSION_FORCES /* Ewald or RF */ - if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*c_numClPerSupercl) - { - /* we have the diagonal: add the charge and LJ self interaction energy term */ - for (i = 0; i < c_numClPerSupercl; i++) - { -#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF - qi = xqib[i * c_clSize + tidxi].w; - E_el += qi*qi; -#endif - -#ifdef LJ_EWALD - E_lj += LDG(&nbparam.nbfp[atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2]); -#endif - } - - /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. 
*/ -#ifdef LJ_EWALD - E_lj /= c_clSize; - E_lj *= 0.5f*c_oneSixth*lje_coeff6_6; -#endif - -#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF - /* Correct for epsfac^2 due to adding qi^2 */ - E_el /= nbparam.epsfac*c_clSize; -#if defined EL_RF || defined EL_CUTOFF - E_el *= -0.5f*c_rf; -#else - E_el *= -beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ -#endif -#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */ - } -#endif /* EXCLUSION_FORCES */ - -#endif /* CALC_ENERGIES */ - -#ifdef EXCLUSION_FORCES - const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi); -#endif - - /* loop over the j clusters = seen by any of the atoms in the current super-cluster */ - for (j4 = cij4_start; j4 < cij4_end; j4++) - { - wexcl_idx = pl_cj4[j4].imei[widx].excl_ind; - imask = pl_cj4[j4].imei[widx].imask; - wexcl = excl[wexcl_idx].pair[(tidx) & (warp_size - 1)]; - -#ifndef PRUNE_NBL - if (imask) -#endif - { - /* Pre-load cj into shared memory on both warps separately */ - if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize)) - { - cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi]; - } - - /* Unrolling this loop with pruning leads to register spilling; - Tested with up to nvcc 7.5 */ -#if !defined PRUNE_NBL -#pragma unroll 4 -#endif - for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++) - { - if (imask & (superClInteractionMask << (jm * c_numClPerSupercl))) - { - mask_ji = (1U << (jm * c_numClPerSupercl)); - - cj = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize]; - aj = cj * c_clSize + tidxj; - - /* load j atom data */ - xqbuf = xq[aj]; - xj = make_float3(xqbuf.x, xqbuf.y, xqbuf.z); - qj_f = xqbuf.w; -#ifndef LJ_COMB - typej = atom_types[aj]; -#else - ljcp_j = lj_comb[aj]; -#endif - - fcj_buf = make_float3(0.0f); - -#if !defined PRUNE_NBL -#pragma unroll 8 -#endif - for (i = 0; i < c_numClPerSupercl; i++) - { - if (imask & mask_ji) - { - ci = sci * c_numClPerSupercl + i; /* i cluster index */ - ai = ci * c_clSize + tidxi; /* i atom index */ - - /* all threads load an atom from i cluster ci into shmem! */ - xqbuf = xqib[i * c_clSize + tidxi]; - xi = make_float3(xqbuf.x, xqbuf.y, xqbuf.z); - - /* distance between i and j atoms */ - rv = xi - xj; - r2 = norm2(rv); - -#ifdef PRUNE_NBL - /* If _none_ of the atoms pairs are in cutoff range, - the bit corresponding to the current - cluster-pair in imask gets set to 0. */ - if (!__any(r2 < rlist_sq)) - { - imask &= ~mask_ji; - } -#endif - - int_bit = (wexcl & mask_ji) ? 
1.0f : 0.0f; - - /* cutoff & exclusion check */ -#ifdef EXCLUSION_FORCES - if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj))) -#else - if ((r2 < rcoulomb_sq) * int_bit) -#endif - { - /* load the rest of the i-atom parameters */ - qi = xqbuf.w; - -#ifndef LJ_COMB - /* LJ 6*C6 and 12*C12 */ - typei = atom_types[ai]; - fetch_nbfp_c6_c12(c6, c12, nbparam, ntypes * typei + typej); -#else - ljcp_i = lj_comb[ai]; -#ifdef LJ_COMB_GEOM - c6 = ljcp_i.x * ljcp_j.x; - c12 = ljcp_i.y * ljcp_j.y; -#else - /* LJ 2^(1/6)*sigma and 12*epsilon */ - sigma = ljcp_i.x + ljcp_j.x; - epsilon = ljcp_i.y * ljcp_j.y; -#if defined CALC_ENERGIES || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH - convert_sigma_epsilon_to_c6_c12(sigma, epsilon, &c6, &c12); -#endif -#endif /* LJ_COMB_GEOM */ -#endif /* LJ_COMB */ - - // Ensure distance do not become so small that r^-12 overflows - r2 = max(r2, NBNXN_MIN_RSQ); - - inv_r = rsqrt(r2); - inv_r2 = inv_r * inv_r; -#if !defined LJ_COMB_LB || defined CALC_ENERGIES - inv_r6 = inv_r2 * inv_r2 * inv_r2; -#ifdef EXCLUSION_FORCES - /* We could mask inv_r2, but with Ewald - * masking both inv_r6 and F_invr is faster */ - inv_r6 *= int_bit; -#endif /* EXCLUSION_FORCES */ - - F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2; -#if defined CALC_ENERGIES || defined LJ_POT_SWITCH - E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam.repulsion_shift.cpot)*c_oneTwelveth - - c6 * (inv_r6 + nbparam.dispersion_shift.cpot)*c_oneSixth); -#endif -#else /* !LJ_COMB_LB || CALC_ENERGIES */ - float sig_r = sigma*inv_r; - float sig_r2 = sig_r*sig_r; - float sig_r6 = sig_r2*sig_r2*sig_r2; -#ifdef EXCLUSION_FORCES - sig_r6 *= int_bit; -#endif /* EXCLUSION_FORCES */ - - F_invr = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2; -#endif /* !LJ_COMB_LB || CALC_ENERGIES */ - -#ifdef LJ_FORCE_SWITCH -#ifdef CALC_ENERGIES - calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p); -#else - calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr); -#endif /* CALC_ENERGIES */ -#endif /* LJ_FORCE_SWITCH */ - - -#ifdef LJ_EWALD -#ifdef LJ_EWALD_COMB_GEOM -#ifdef CALC_ENERGIES - calculate_lj_ewald_comb_geom_F_E(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p); -#else - calculate_lj_ewald_comb_geom_F(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr); -#endif /* CALC_ENERGIES */ -#elif defined LJ_EWALD_COMB_LB - calculate_lj_ewald_comb_LB_F_E(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, -#ifdef CALC_ENERGIES - int_bit, &F_invr, &E_lj_p -#else - 0, &F_invr, NULL -#endif /* CALC_ENERGIES */ - ); -#endif /* LJ_EWALD_COMB_GEOM */ -#endif /* LJ_EWALD */ - -#ifdef LJ_POT_SWITCH -#ifdef CALC_ENERGIES - calculate_potential_switch_F_E(nbparam, inv_r, r2, &F_invr, &E_lj_p); -#else - calculate_potential_switch_F(nbparam, inv_r, r2, &F_invr, &E_lj_p); -#endif /* CALC_ENERGIES */ -#endif /* LJ_POT_SWITCH */ - -#ifdef VDW_CUTOFF_CHECK - /* Separate VDW cut-off check to enable twin-range cut-offs - * (rvdw < rcoulomb <= rlist) - */ - vdw_in_range = (r2 < rvdw_sq) ? 
1.0f : 0.0f;
-                        F_invr  *= vdw_in_range;
-#ifdef CALC_ENERGIES
-                        E_lj_p  *= vdw_in_range;
-#endif
-#endif                  /* VDW_CUTOFF_CHECK */
-
-#ifdef CALC_ENERGIES
-                        E_lj    += E_lj_p;
-#endif
-
-
-#ifdef EL_CUTOFF
-#ifdef EXCLUSION_FORCES
-                        F_invr  += qi * qj_f * int_bit * inv_r2 * inv_r;
-#else
-                        F_invr  += qi * qj_f * inv_r2 * inv_r;
-#endif
-#endif
-#ifdef EL_RF
-                        F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
-#endif
-#if defined EL_EWALD_ANA
-                        F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
-#elif defined EL_EWALD_TAB
-                        F_invr  += qi * qj_f * (int_bit*inv_r2 -
-                                                interpolate_coulomb_force_r(nbparam, r2 * inv_r)) * inv_r;
-#endif                  /* EL_EWALD_ANA/TAB */
-
-#ifdef CALC_ENERGIES
-#ifdef EL_CUTOFF
-                        E_el    += qi * qj_f * (int_bit*inv_r - c_rf);
-#endif
-#ifdef EL_RF
-                        E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
-#endif
-#ifdef EL_EWALD_ANY
-                        /* 1.0f - erff is faster than erfcf */
-                        E_el    += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
-#endif                  /* EL_EWALD_ANY */
-#endif
-                        f_ij    = rv * F_invr;
-
-                        /* accumulate j forces in registers */
-                        fcj_buf -= f_ij;
-
-                        /* accumulate i forces in registers */
-                        fci_buf[i] += f_ij;
-                    }
-                }
-
-                /* shift the mask bit by 1 */
-                mask_ji += mask_ji;
-            }
-
-            /* reduce j forces */
-            /* store j forces in shmem */
-            f_buf[                   tidx] = fcj_buf.x;
-            f_buf[    c_fbufStride + tidx] = fcj_buf.y;
-            f_buf[2 * c_fbufStride + tidx] = fcj_buf.z;
-
-            reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
-        }
-    }
-#ifdef PRUNE_NBL
-    /* Update the imask with the new one which does not contain the
-       out of range clusters anymore. */
-    pl_cj4[j4].imei[widx].imask = imask;
-#endif
-    }
-    }
-
-    /* skip central shifts when summing shift forces */
-    if (nb_sci.shift == CENTRAL)
-    {
-        bCalcFshift = false;
-    }
-
-    float fshift_buf = 0.0f;
-
-    /* reduce i forces */
-    for (i = 0; i < c_numClPerSupercl; i++)
-    {
-        ai  = (sci * c_numClPerSupercl + i) * c_clSize + tidxi;
-        f_buf[                   tidx] = fci_buf[i].x;
-        f_buf[    c_fbufStride + tidx] = fci_buf[i].y;
-        f_buf[2 * c_fbufStride + tidx] = fci_buf[i].z;
-        __syncthreads();
-        reduce_force_i(f_buf, f,
-                       &fshift_buf, bCalcFshift,
-                       tidxi, tidxj, ai);
-        __syncthreads();
-    }
-
-    /* add up local shift forces into global mem, tidxj indexes x,y,z */
-    if (bCalcFshift && tidxj < 3)
-    {
-        atomicAdd(&(atdat.fshift[nb_sci.shift].x) + tidxj, fshift_buf);
-    }
-
-#ifdef CALC_ENERGIES
-    /* flush the energies to shmem and reduce them */
-    f_buf[                tidx] = E_lj;
-    f_buf[c_fbufStride + tidx] = E_el;
-    reduce_energy_pow2(f_buf + (tidx & warp_size), e_lj, e_el, tidx & ~warp_size);
-#endif
-}
-#endif /* FUNCTION_DECLARATION_ONLY */
-
-#undef THREADS_PER_BLOCK
-
-#undef EL_EWALD_ANY
-#undef EXCLUSION_FORCES
-#undef LJ_EWALD
-
-#undef LJ_COMB
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
index 3840c48a7e..e9809c8c0d 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -507,9 +507,8 @@ void reduce_force_j_generic(float *f_buf, float3 *fout,
 }

 /*! Final j-force reduction; this implementation only with power of two
- * array sizes and with sm >= 3.0
+ * array sizes.
  */
-#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 static __forceinline__ __device__
 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                               int tidxi, int aidx,
@@ -539,7 +538,6 @@ void reduce_force_j_warp_shfl(float3 f, float3 *fout,
         atomicAdd((&fout[aidx].x) + tidxi, f.x);
     }
 }
-#endif

 /*! Final i-force reduction; this generic implementation works with
  * arbitrary array sizes.
@@ -634,9 +632,8 @@ void reduce_force_i(float *f_buf, float3 *f,
 }

 /*! Final i-force reduction; this implementation works only with power of two
- * array sizes and with sm >= 3.0
+ * array sizes.
  */
-#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 static __forceinline__ __device__
 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                               float *fshift_buf, bool bCalcFshift,
@@ -671,7 +668,6 @@ void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
         }
     }
 }
-#endif

 /*! Energy reduction; this implementation works only with power of two
  * array sizes.
@@ -711,9 +707,8 @@ void reduce_energy_pow2(volatile float *buf,
 }

 /*! Energy reduction; this implementation works only with power of two
- * array sizes and with sm >= 3.0
+ * array sizes.
  */
-#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 static __forceinline__ __device__
 void reduce_energy_warp_shfl(float E_lj, float E_el,
                              float *e_lj, float *e_el,
@@ -738,6 +733,5 @@ void reduce_energy_warp_shfl(float E_lj, float E_el,
         atomicAdd(e_el, E_el);
     }
 }
-#endif /* GMX_PTX_ARCH */

 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
index 65406cc462..16977d73d2 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@ -52,55 +52,48 @@
  * \ingroup module_mdlib
  */

-/* Use the standard (non-Fermi) kernel in host pass too. */
-#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
-#define FLAVOR_LEVEL_GENERATOR "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
-#else
-#define FLAVOR_LEVEL_GENERATOR "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh"
-#endif
-
 /* Analytical plain cut-off electrostatics kernels
  */
 #define EL_CUTOFF

 /* cut-off + V shift LJ */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w geometric combination rules */
 #define LJ_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w LB combination rules */
 #define LJ_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w geometric combination rules */
 #define LJ_EWALD_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w LB combination rules */
 #define LJ_EWALD_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* F switch LJ */
 #define LJ_FORCE_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_FORCE_SWITCH
 #undef NB_KERNEL_FUNC_NAME
 /* V switch LJ */
 #define LJ_POT_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME

@@ -113,42 +106,42 @@

 /* cut-off + V shift LJ */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w geometric combination rules */
 #define LJ_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w LB combination rules */
 #define LJ_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w geometric combination rules */
 #define LJ_EWALD_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w LB combination rules */
 #define LJ_EWALD_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* F switch LJ */
 #define LJ_FORCE_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_FORCE_SWITCH
 #undef NB_KERNEL_FUNC_NAME
 /* V switch LJ */
 #define LJ_POT_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME

@@ -161,42 +154,42 @@

 /* cut-off + V shift LJ */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w geometric combination rules */
 #define LJ_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w LB combination rules */
 #define LJ_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w geometric combination rules */
 #define LJ_EWALD_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w LB combination rules */
 #define LJ_EWALD_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* F switch LJ */
 #define LJ_FORCE_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_FORCE_SWITCH
 #undef NB_KERNEL_FUNC_NAME
 /* V switch LJ */
 #define LJ_POT_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME

@@ -210,42 +203,42 @@

 /* cut-off + V shift LJ */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w geometric combination rules */
 #define LJ_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w LB combination rules */
 #define LJ_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w geometric combination rules */
 #define LJ_EWALD_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w LB combination rules */
 #define LJ_EWALD_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* F switch LJ */
 #define LJ_FORCE_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_FORCE_SWITCH
 #undef NB_KERNEL_FUNC_NAME
 /* V switch LJ */
 #define LJ_POT_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME

@@ -258,42 +251,42 @@

 /* cut-off + V shift LJ */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w geometric combination rules */
 #define LJ_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w LB combination rules */
 #define LJ_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w geometric combination rules */
 #define LJ_EWALD_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w LB combination rules */
 #define LJ_EWALD_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* F switch LJ */
 #define LJ_FORCE_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_FORCE_SWITCH
 #undef NB_KERNEL_FUNC_NAME
 /* V switch LJ */
 #define LJ_POT_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME

@@ -306,42 +299,42 @@

 /* cut-off + V shift LJ */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w geometric combination rules */
 #define LJ_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* cut-off + V shift LJ w LB combination rules */
 #define LJ_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w geometric combination rules */
 #define LJ_EWALD_COMB_GEOM
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_GEOM
 #undef NB_KERNEL_FUNC_NAME
 /* LJ-Ewald w LB combination rules */
 #define LJ_EWALD_COMB_LB
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_EWALD_COMB_LB
 #undef NB_KERNEL_FUNC_NAME
 /* F switch LJ */
 #define LJ_FORCE_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_FORCE_SWITCH
 #undef NB_KERNEL_FUNC_NAME
 /* V switch LJ */
 #define LJ_POT_SWITCH
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME

diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
index 443d656366..ada22a5d42 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
@@ -818,17 +818,16 @@ int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
      * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
      *
      */
-    //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
-    if (!bForceTabulatedEwald)
+    /* By default use analytical Ewald. */
+    bUseAnalyticalEwald = true;
+    if (bForceAnalyticalEwald)
     {
-        bUseAnalyticalEwald = true;
-
         if (debug)
         {
             fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
         }
     }
-    else
+    else if (bForceTabulatedEwald)
     {
         bUseAnalyticalEwald = false;

-- 
2.22.0
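
Background for reviewers: the reduce_force_*_warp_shfl and reduce_energy_warp_shfl
helpers above can now be compiled unconditionally because warp shuffle
instructions exist on every remaining target (CC >= 3.0); the removed
GMX_PTX_ARCH guards only existed to hide them from Fermi builds. The sketch
below shows the shuffle-reduction pattern in a minimal, self-contained form.
It is illustrative only: the kernel, buffer names, and the single accumulation
target are hypothetical rather than GROMACS code, and it calls the CUDA 9
__shfl_down_sync intrinsic directly where the real code goes through a
compatibility wrapper for older toolkits.

    /* warp_reduce_sketch.cu: minimal shuffle-based warp reduction
     * (illustrative only; names are hypothetical, not the GROMACS API).
     * Needs CC >= 3.0 and CUDA >= 9.0, e.g.:
     *   nvcc -arch=sm_30 warp_reduce_sketch.cu */
    #include <cstdio>

    __inline__ __device__ float warpReduceSum(float v)
    {
        /* Halve the shuffle stride each step; after log2(warpSize)
         * steps lane 0 holds the sum over the whole warp. */
        for (int offset = 16; offset > 0; offset >>= 1)
        {
            v += __shfl_down_sync(0xffffffffU, v, offset);
        }
        return v;
    }

    __global__ void sumForces(const float3 *fIn, float3 *fOut, int n)
    {
        int    i = blockIdx.x*blockDim.x + threadIdx.x;
        float3 f = (i < n) ? fIn[i] : make_float3(0.0f, 0.0f, 0.0f);

        /* Component-wise reduction entirely in registers: no shared-memory
         * buffer and no __syncthreads(), which is what made the Fermi
         * shared-memory reduction path removable. */
        float fx = warpReduceSum(f.x);
        float fy = warpReduceSum(f.y);
        float fz = warpReduceSum(f.z);

        /* One atomic per warp instead of one per thread. */
        if ((threadIdx.x & (warpSize - 1)) == 0)
        {
            atomicAdd(&fOut->x, fx);
            atomicAdd(&fOut->y, fy);
            atomicAdd(&fOut->z, fz);
        }
    }

    int main()
    {
        const int n = 1024;
        float3   *fIn, *fOut;
        cudaMallocManaged(&fIn,  n*sizeof(float3));
        cudaMallocManaged(&fOut, sizeof(float3));
        for (int i = 0; i < n; i++)
        {
            fIn[i] = make_float3(1.0f, 2.0f, 3.0f);
        }
        *fOut = make_float3(0.0f, 0.0f, 0.0f);
        sumForces<<<(n + 127)/128, 128>>>(fIn, fOut, n);
        cudaDeviceSynchronize();
        printf("sum = %g %g %g\n", fOut->x, fOut->y, fOut->z); /* expect 1024 2048 3072 */
        cudaFree(fIn);
        cudaFree(fOut);
        return 0;
    }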
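
The repetitive hunks in nbnxn_cuda_kernels.cuh are easier to review once the
pattern they edit is clear: the kernel header is deliberately written without
an include guard, and each #define NB_KERNEL_FUNC_NAME / #include / #undef
triplet stamps out one kernel flavor from the same template. With the Fermi
template deleted there is only one possible include target left, so the
FLAVOR_LEVEL_GENERATOR indirection could be replaced by a direct include of
nbnxn_cuda_kernel.cuh. Below is a cut-down sketch of the mechanism; the file
and kernel names are made up for illustration, and the real macro is variadic
so that __VA_ARGS__ can append further suffixes (e.g. for the force-only,
force+energy, and pruning variants).

    /* --- nb_flavor_template.cuh (sketch; deliberately no include guard) ---
     * Each inclusion of this header emits one kernel, named by the current
     * definition of NB_KERNEL_FUNC_NAME. Placeholder body only. */
    __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel)(const float *q, float *e)
    {
    #ifdef EL_RF
        e[threadIdx.x] = 0.5f * q[threadIdx.x];  /* reaction-field variant */
    #else
        e[threadIdx.x] = q[threadIdx.x];         /* plain cut-off variant */
    #endif
    }

    /* --- nb_flavors.cu (sketch of the generator translation unit) ---
     * Re-include the template once per flavor, redefining the name-mangling
     * macro in between, exactly as the hunks above now do with a direct
     * #include of nbnxn_cuda_kernel.cuh. */
    #define NB_KERNEL_FUNC_NAME(x) x ## _ElecCut
    #include "nb_flavor_template.cuh"       /* emits nbnxn_kernel_ElecCut */
    #undef  NB_KERNEL_FUNC_NAME

    #define EL_RF
    #define NB_KERNEL_FUNC_NAME(x) x ## _ElecRF
    #include "nb_flavor_template.cuh"       /* emits nbnxn_kernel_ElecRF */
    #undef  NB_KERNEL_FUNC_NAME
    #undef  EL_RF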
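
Finally, the nbnxn_ocl.cpp hunk replaces the commented-out device-generation
test with an unconditional default: analytical Ewald kernels are always
preferred now that no supported NVIDIA device predates CC 3.0, and the two
force flags only select the non-default flavor. The resulting decision, in a
hypothetical standalone form (bTwinCut handling and the actual sources of the
flags are omitted):

    /* Sketch of the post-change kernel-flavor decision; illustrative only. */
    static bool pickAnalyticalEwald(bool bForceAnalyticalEwald,
                                    bool bForceTabulatedEwald)
    {
        bool bUseAnalyticalEwald = true;  /* default on all supported GPUs */
        if (bForceTabulatedEwald && !bForceAnalyticalEwald)
        {
            bUseAnalyticalEwald = false;  /* explicit override picks tabulated */
        }
        return bUseAnalyticalEwald;
    }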