else()
set(REQUIRED_CUDA_VERSION 7.0)
endif()
-set(REQUIRED_CUDA_COMPUTE_CAPABILITY 2.0)
+set(REQUIRED_CUDA_COMPUTE_CAPABILITY 3.0)
# OpenCL required version: 1.2 or newer
set(REQUIRED_OPENCL_MIN_VERSION 1.2)
list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_${_target}")
endforeach()
else()
- if(CUDA_VERSION VERSION_LESS "9.00") # < 9.0
- list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_20")
- endif()
list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_30")
list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_35")
# clang 6.0 + CUDA 9.0 seems to have issues generating code for sm_37
https://developer.nvidia.com/cuda-gpus")
endif()
- set(CUDA_NOTFOUND_MESSAGE "mdrun supports native GPU acceleration on NVIDIA hardware with compute capability >= ${REQUIRED_CUDA_COMPUTE_CAPABILITY} (Fermi or later). This requires the NVIDIA CUDA toolkit, which was not found. Its location can be hinted by setting the CUDA_TOOLKIT_ROOT_DIR CMake option (does not work as an environment variable). The typical location would be /usr/local/cuda[-version]. Note that CPU or GPU acceleration can be selected at runtime.
+ set(CUDA_NOTFOUND_MESSAGE "mdrun supports native GPU acceleration on NVIDIA hardware with compute capability >= ${REQUIRED_CUDA_COMPUTE_CAPABILITY} (Kepler or later). This requires the NVIDIA CUDA toolkit, which was not found. Its location can be hinted by setting the CUDA_TOOLKIT_ROOT_DIR CMake option (does not work as an environment variable). The typical location would be /usr/local/cuda[-version]. Note that CPU or GPU acceleration can be selected at runtime.
${_msg}")
unset(_msg)
else()
# Set the CUDA GPU architectures to compile for:
# - with CUDA >=5.0 <6.5: CC <=3.5 is supported
- # => compile sm_20, sm_30, sm_35 SASS, and compute_35 PTX
+ # => compile sm_30, sm_35 SASS, and compute_35 PTX
# - with CUDA ==6.5: CC <=3.7 and 5.0 are supported
- # => compile sm_20, sm_30, sm_35, sm_37 sm_50, SASS, and compute_50 PTX
+ # => compile sm_30, sm_35, sm_37, sm_50 SASS, and compute_50 PTX
# - with CUDA >=7.0 CC 5.2 is supported (5.3, Tegra X1 we don't generate code for)
- # => compile sm_20, sm_30, sm_35, sm_37, sm_50, & sm_52 SASS, and compute_52 PTX
+ # => compile sm_30, sm_35, sm_37, sm_50, & sm_52 SASS, and compute_52 PTX
# - with CUDA >=8.0 CC 6.0-6.2 is supported (but we know nothing about CC 6.2, so we won't generate code for it)
- # => compile sm_20, sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61 SASS, and compute_60 and compute_61 PTX
+ # => compile sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61 SASS, and compute_60 and compute_61 PTX
# - with CUDA >=9.0 CC 7.0 is supported and CC 2.0 is no longer supported
# => compile sm_30, sm_35, sm_37, sm_50, sm_52, sm_60, sm_61, sm_70 SASS, and compute_70 PTX
#
# equally fast as compiling with sm_5.2 anyway.
# First add flags that trigger SASS (binary) code generation for physical arch
- if(CUDA_VERSION VERSION_LESS "9.00") # < 9.0
- list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_20,code=sm_20")
- endif()
list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_30,code=sm_30")
list (APPEND GMX_CUDA_NVCC_GENCODE_FLAGS "-gencode;arch=compute_35,code=sm_35")
# assemble the CUDA flags
list(APPEND GMX_CUDA_NVCC_FLAGS "${GMX_CUDA_NVCC_GENCODE_FLAGS}")
list(APPEND GMX_CUDA_NVCC_FLAGS "-use_fast_math")
-if (CUDA_VERSION VERSION_EQUAL "8.0")
- # requesting sm_20 triggers deprecation messages with nvcc 8.0 which we better avoid
- list(APPEND GMX_CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
-endif()
# assemble the CUDA host compiler flags
list(APPEND GMX_CUDA_NVCC_FLAGS "${CUDA_HOST_COMPILER_OPTIONS}")
required. You are strongly recommended to
get the latest CUDA version and driver that supports your hardware, but
beware of possible performance regressions in newer CUDA versions on
-older hardware. Note that compute capability 2.0 (Fermi)
-devices are no longer supported from CUDA 9.0 and later.
+older hardware.
While some CUDA compilers (nvcc) might not
officially support recent versions of gcc as the back-end compiler, we
still recommend that you at least use a gcc version recent enough to
as GROMACS evolves, complicated the GROMACS code, and wasn't regularly tested or maintained.
It might return if some of these conditions change.
+Support for CUDA compute capability 2.x removed
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+The Fermi-era GPUs (circa 2010) are no longer in widespread use, are
+not tested in Jenkins, and complicated the code, so they are no longer
+supported.
+
Contrib directory removed
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
This code had not been maintained in years, so likely didn't work, and
target minimum number of pair-lists in order to improve multi-processor load-balance for better
performance with small simulation systems. Must be set to a non-negative integer;
the value 0 disables list splitting.
- The default value is optimized for supported GPUs (NVIDIA Fermi to Maxwell),
+ The default value is optimized for supported GPUs,
therefore changing it is not necessary for normal usage, but it can be useful on future architectures.
``GMX_NBLISTCG``
float &fy,
float &fz)
{
-#if (GMX_PTX_ARCH >= 300)
if (!(order & (order - 1))) // Only for orders of power of 2
{
const unsigned int activeMask = c_fullWarpMask;
}
}
else
-#endif
{
// We use blockSize shared memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
// which are stored separately (first 2 dimensions only)
const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
/* Early return for fully empty blocks at the end
- * (should only happen on Fermi or billions of input atoms)
+ * (should only happen for billions of input atoms)
*/
if (atomIndexOffset >= kernelParams.atoms.nAtoms)
{
//! Gathering max block size in threads
constexpr int c_gatherMaxThreadsPerBlock = c_gatherMaxWarpsPerBlock * warp_size;
-//! Gathering min blocks per CUDA multiprocessor - for CC2.x, we just take the CUDA limit of 8 to avoid the warning
-constexpr int c_gatherMinBlocksPerMP = (GMX_PTX_ARCH < 300) ? GMX_CUDA_MAX_BLOCKS_PER_MP : (GMX_CUDA_MAX_THREADS_PER_MP / c_gatherMaxThreadsPerBlock);
+//! Gathering min blocks per CUDA multiprocessor
+constexpr int c_gatherMinBlocksPerMP = GMX_CUDA_MAX_THREADS_PER_MP / c_gatherMaxThreadsPerBlock;
#endif // GMX_GPU == GMX_GPU_CUDA
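As a quick sanity check of the simplified expression, a hedged worked example (warp_size is 32 on all supported NVIDIA hardware; the value of c_gatherMaxWarpsPerBlock is assumed to be 4 here, purely for illustration):

    // c_gatherMaxThreadsPerBlock = c_gatherMaxWarpsPerBlock * warp_size = 4 * 32   = 128
    // c_gatherMinBlocksPerMP     = GMX_CUDA_MAX_THREADS_PER_MP / 128    = 2048/128 = 16   (CC 3.x)
    // i.e. on CC 3.x the occupancy target coincides with the hardware limit of 16 blocks/MP,
    // so dropping the CC 2.x special case does not change the result on the remaining targets.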
initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
kernelParamsPtr->fractShiftsTableTexture,
pmeGpu->common->fsh.data(),
- newFractShiftsSize,
- pmeGpu->deviceInfo);
+ newFractShiftsSize);
initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
kernelParamsPtr->gridlineIndicesTableTexture,
pmeGpu->common->nn.data(),
- newFractShiftsSize,
- pmeGpu->deviceInfo);
+ newFractShiftsSize);
#elif GMX_GPU == GMX_GPU_OPENCL
// No dedicated texture routines....
allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize, pmeGpu->archSpecific->context);
auto *kernelParamsPtr = pmeGpu->kernelParams.get();
#if GMX_GPU == GMX_GPU_CUDA
destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
- kernelParamsPtr->fractShiftsTableTexture,
- pmeGpu->deviceInfo);
+ kernelParamsPtr->fractShiftsTableTexture);
destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
- kernelParamsPtr->gridlineIndicesTableTexture,
- pmeGpu->deviceInfo);
+ kernelParamsPtr->gridlineIndicesTableTexture);
#elif GMX_GPU == GMX_GPU_OPENCL
freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
/* Optional energy/virial reduction */
if (computeEnergyAndVirial)
{
-#if (GMX_PTX_ARCH >= 300)
/* A tricky shuffle reduction inspired by reduce_force_j_warp_shfl.
* The idea is to reduce 7 energy/virial components into a single variable (aligned by 8).
* We will reduce everything into virxx.
atomicAdd(gm_virialAndEnergy + componentIndex, output);
}
}
-#else
- /* Shared memory reduction with atomics for compute capability < 3.0.
- * Each component is first reduced into warp_size positions in the shared memory;
- * Then first c_virialAndEnergyCount warps reduce everything further and add to the global memory.
- * This can likely be improved, but is anyway faster than the previous straightforward reduction,
- * which was using too much shared memory (for storing all 7 floats on each thread).
- * [48KB (shared mem limit per SM on CC2.x) / sizeof(float) (4) / c_solveMaxThreadsPerBlock (256) / c_virialAndEnergyCount (7) ==
- * 6 blocks per SM instead of 16 which is maximum on CC2.x].
- */
-
- const int lane = threadLocalId & (warp_size - 1);
- const int warpIndex = threadLocalId / warp_size;
- const bool firstWarp = (warpIndex == 0);
- __shared__ float sm_virialAndEnergy[c_virialAndEnergyCount * warp_size];
- if (firstWarp)
- {
- sm_virialAndEnergy[0 * warp_size + lane] = virxx;
- sm_virialAndEnergy[1 * warp_size + lane] = viryy;
- sm_virialAndEnergy[2 * warp_size + lane] = virzz;
- sm_virialAndEnergy[3 * warp_size + lane] = virxy;
- sm_virialAndEnergy[4 * warp_size + lane] = virxz;
- sm_virialAndEnergy[5 * warp_size + lane] = viryz;
- sm_virialAndEnergy[6 * warp_size + lane] = energy;
- }
- __syncthreads();
- if (!firstWarp)
- {
- atomicAdd(sm_virialAndEnergy + 0 * warp_size + lane, virxx);
- atomicAdd(sm_virialAndEnergy + 1 * warp_size + lane, viryy);
- atomicAdd(sm_virialAndEnergy + 2 * warp_size + lane, virzz);
- atomicAdd(sm_virialAndEnergy + 3 * warp_size + lane, virxy);
- atomicAdd(sm_virialAndEnergy + 4 * warp_size + lane, virxz);
- atomicAdd(sm_virialAndEnergy + 5 * warp_size + lane, viryz);
- atomicAdd(sm_virialAndEnergy + 6 * warp_size + lane, energy);
- }
- __syncthreads();
-
- GMX_UNUSED_VALUE(activeWarps);
- assert(activeWarps >= c_virialAndEnergyCount); // we need to cover all components, or have multiple iterations otherwise
- const int componentIndex = warpIndex;
- if (componentIndex < c_virialAndEnergyCount)
- {
- const int targetIndex = threadLocalId;
-#pragma unroll
- for (int reductionStride = warp_size >> 1; reductionStride >= 1; reductionStride >>= 1)
- {
- if (lane < reductionStride)
- {
- sm_virialAndEnergy[targetIndex] += sm_virialAndEnergy[targetIndex + reductionStride];
- }
- }
- if (lane == 0)
- {
- atomicAdd(gm_virialAndEnergy + componentIndex, sm_virialAndEnergy[targetIndex]);
- }
- }
-#endif
}
}
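For reference, a minimal sketch of the plain warp-shuffle sum that the "tricky" reduction above generalizes; the kernel interleaves the 7 energy/virial components into the same shuffle steps rather than calling something like this once per component (c_fullWarpMask and warp_size as used elsewhere in the CUDA code; CUDA 9 *_sync intrinsics):

    static __forceinline__ __device__ float warpReduceSum(float v)
    {
        // Pairwise halving: after log2(warp_size) steps, lane 0 holds the warp-wide sum
        for (int offset = warp_size >> 1; offset > 0; offset >>= 1)
        {
            v += __shfl_down_sync(c_fullWarpMask, v, offset);
        }
        return v;
    }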
const int atomIndexOffset = blockIndex * atomsPerBlock;
/* Early return for fully empty blocks at the end
- * (should only happen on Fermi or billions of input atoms)
+ * (should only happen for billions of input atoms)
*/
if (atomIndexOffset >= kernelParams.atoms.nAtoms)
{
/*! \brief Allow disabling CUDA textures using the GMX_DISABLE_CUDA_TEXTURES macro.
*
- * Only texture objects supported, disable textures for <= CC 2.0 (but not in host code).
+ * Only texture objects supported.
* Disable texture support missing in clang (all versions up to <=5.0-dev as of writing).
*
* This option will not influence functionality. All features using textures ought
* to have fallback for texture-less reads (direct/LDG loads), all new code needs
* to provide fallback code.
*/
-#if defined(GMX_DISABLE_CUDA_TEXTURES) || (GMX_PTX_ARCH > 0 && GMX_PTX_ARCH < 300) || (defined(__clang__) && defined(__CUDA__))
+#if defined(GMX_DISABLE_CUDA_TEXTURES) || (defined(__clang__) && defined(__CUDA__))
#define DISABLE_CUDA_TEXTURES 1
#else
#define DISABLE_CUDA_TEXTURES 0
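A hedged sketch of the fallback pattern the comment above mandates (the helper name and exact shape are illustrative, not necessarily what the tree provides; LDG is the read-only load macro used by the GROMACS CUDA kernels):

    template<typename T>
    static __forceinline__ __device__
    T fetchFromParamLookupTable(const T *d_ptr, cudaTextureObject_t texObj, int index)
    {
    #if DISABLE_CUDA_TEXTURES
        return LDG(d_ptr + index);           // direct/LDG load, no texture needed
    #else
        return tex1Dfetch<T>(texObj, index); // texture-object read
    #endif
    }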
*
*/
#if GMX_PTX_ARCH > 0
- #if GMX_PTX_ARCH <= 210 // CC 2.x
- #define GMX_CUDA_MAX_BLOCKS_PER_MP 8
- #define GMX_CUDA_MAX_THREADS_PER_MP 1536
- #elif GMX_PTX_ARCH <= 370 // CC 3.x
+ #if GMX_PTX_ARCH <= 370 // CC 3.x
#define GMX_CUDA_MAX_BLOCKS_PER_MP 16
#define GMX_CUDA_MAX_THREADS_PER_MP 2048
#else // CC 5.x, 6.x
return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
}
-/*! \brief Return whether texture objects are used on this device.
- *
- * \param[in] pointer to the GPU device info structure to inspect for texture objects support
- * \return true if texture objects are used on this device
- */
-static inline bool use_texobj(const gmx_device_info_t *dev_info)
-{
- assert(!c_disableCudaTextures);
- /* Only device CC >= 3.0 (Kepler and later) support texture objects */
- return (dev_info->prop.major >= 3);
-}
-
/*! \brief Set up texture object for an array of type T.
*
* Set up texture object for an array of type T and bind it to the device memory
void initParamLookupTable(T * &d_ptr,
cudaTextureObject_t &texObj,
const T *h_ptr,
- int numElem,
- const gmx_device_info_t *devInfo)
+ int numElem)
{
const size_t sizeInBytes = numElem * sizeof(*d_ptr);
cudaError_t stat = cudaMalloc((void **)&d_ptr, sizeInBytes);
if (!c_disableCudaTextures)
{
- if (use_texobj(devInfo))
- {
- setup1DTexture<T>(texObj, d_ptr, sizeInBytes);
- }
+ setup1DTexture<T>(texObj, d_ptr, sizeInBytes);
}
}
template <typename T>
void destroyParamLookupTable(T *d_ptr,
- cudaTextureObject_t texObj,
- const gmx_device_info_t *devInfo)
+ cudaTextureObject_t texObj)
{
if (!c_disableCudaTextures)
{
- if (use_texobj(devInfo))
- {
- CU_RET_ERR(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject on texObj failed");
- }
+ CU_RET_ERR(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject on texObj failed");
}
CU_RET_ERR(cudaFree(d_ptr), "cudaFree failed");
}
* One should also verify that the result of cudaCreateChannelDesc<T>() during texture setup
 * looks reasonable when instantiating the templates for new types, just in case.
*/
-template void initParamLookupTable<float>(float * &, cudaTextureObject_t &, const float *, int, const gmx_device_info_t *);
-template void destroyParamLookupTable<float>(float *, cudaTextureObject_t, const gmx_device_info_t *);
-template void initParamLookupTable<int>(int * &, cudaTextureObject_t &, const int *, int, const gmx_device_info_t *);
-template void destroyParamLookupTable<int>(int *, cudaTextureObject_t, const gmx_device_info_t *);
+template void initParamLookupTable<float>(float * &, cudaTextureObject_t &, const float *, int);
+template void destroyParamLookupTable<float>(float *, cudaTextureObject_t);
+template void initParamLookupTable<int>(int * &, cudaTextureObject_t &, const int *, int);
+template void destroyParamLookupTable<int>(int *, cudaTextureObject_t);
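A hedged host-side usage sketch of the simplified interface (the table name and size are made up for illustration; requires <vector>):

    std::vector<float>  h_table(1024, 1.0f);
    float              *d_table = nullptr;
    cudaTextureObject_t texObj  = 0;
    initParamLookupTable(d_table, texObj, h_table.data(), static_cast<int>(h_table.size()));
    // ... launch kernels that read through texObj (or d_table when textures are disabled) ...
    destroyParamLookupTable(d_table, texObj);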
* \param[out] texObj texture object to be initialized
* \param[in] h_ptr pointer to the host memory to be uploaded to the device
* \param[in] numElem number of elements in the h_ptr
- * \param[in] devInfo pointer to the info struct of the device in use
*/
template <typename T>
void initParamLookupTable(T * &d_ptr,
cudaTextureObject_t &texObj,
const T *h_ptr,
- int numElem,
- const gmx_device_info_t *devInfo);
+ int numElem);
// Add extern declarations so each translation unit understands that
// there will be a definition provided.
* \tparam[in] T Raw data type
* \param[in] d_ptr Device pointer to the memory to be deallocated
* \param[in] texObj Texture object to be deinitialized
- * \param[in] devInfo Pointer to the info struct of the device in use
*/
template <typename T>
void destroyParamLookupTable(T *d_ptr,
- cudaTextureObject_t texObj,
- const gmx_device_info_t *devInfo);
+ cudaTextureObject_t texObj);
// Add extern declarations so each translation unit understands that
// there will be a definition provided.
gmx_fatal(FARGS,
"The %s binary does not include support for the CUDA architecture "
"of the selected GPU (device ID #%d, compute capability %d.%d). "
- "By default, GROMACS supports all common architectures, so your GPU "
+ "By default, GROMACS supports all architectures of compute "
+ "capability >= 3.0, so your GPU "
"might be rare, or some architectures were disabled in the build. "
"Consult the install guide for how to use the GMX_CUDA_TARGET_SM and "
"GMX_CUDA_TARGET_COMPUTE CMake variables to add this architecture.",
}
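As a concrete, illustrative remedy for this error, one would reconfigure with e.g. cmake . -DGMX_CUDA_TARGET_SM="30;35;52" or cmake . -DGMX_CUDA_TARGET_COMPUTE=52 (values chosen only for the example) so that SASS or PTX for the missing architecture is built, as the install guide describes.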
CU_RET_ERR(stat, "cudaFuncGetAttributes failed");
-
- if (devInfo->prop.major >= 3 && attributes.ptxVersion < 30)
- {
- gmx_fatal(FARGS,
- "The GPU device code was compiled at runtime from 2.0 source which is "
- "not compatible with the selected GPU (device ID #%d, compute capability %d.%d). "
- "Pass the appropriate target in GMX_CUDA_TARGET_SM or a >=30 value to GMX_CUDA_TARGET_COMPUTE.",
- devInfo->id,
- devInfo->prop.major, devInfo->prop.minor);
- }
}
bool isHostMemoryPinned(const void *h_ptr)
*/
static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
{
- return (dev_prop->major >= 2);
+ return (dev_prop->major >= 3);
}
/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
shmem = c_numClPerSupercl * c_clSize * sizeof(float4);
/* cj in shared memory, for each warp separately */
shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
- if (dinfo->prop.major >= 3)
+
+ if (nbp->vdwtype == evdwCuCUTCOMBGEOM ||
+ nbp->vdwtype == evdwCuCUTCOMBLB)
{
- if (nbp->vdwtype == evdwCuCUTCOMBGEOM ||
- nbp->vdwtype == evdwCuCUTCOMBLB)
- {
- /* i-atom LJ combination parameters in shared memory */
- shmem += c_numClPerSupercl * c_clSize * sizeof(float2);
- }
- else
- {
- /* i-atom types in shared memory */
- shmem += c_numClPerSupercl * c_clSize * sizeof(int);
- }
+ /* i-atom LJ combination parameters in shared memory */
+ shmem += c_numClPerSupercl * c_clSize * sizeof(float2);
}
- if (dinfo->prop.major < 3)
+ else
{
- /* force reduction buffers in shared memory */
- shmem += c_clSize * c_clSize * 3 * sizeof(float);
+ /* i-atom types in shared memory */
+ shmem += c_numClPerSupercl * c_clSize * sizeof(int);
}
+
return shmem;
}
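To make the accounting concrete, a hedged worked example using the cluster constants GROMACS used at the time (assumed values: c_numClPerSupercl == 8, c_clSize == 8, c_nbnxnGpuClusterpairSplit == 2, c_nbnxnGpuJgroupSize == 4, num_threads_z == 1):

    // xqib    (float4): 8 * 8 * 16 B   = 1024 B
    // cj      (int)   : 1 * 2 * 4 * 4 B =  32 B
    // LJ comb (float2): 8 * 8 * 8 B    =  512 B   (combination-rule kernels)
    // types   (int)   : 8 * 8 * 4 B    =  256 B   (all other kernels)
    // => 1568 B or 1312 B per block, well under the shared-memory limit,
    //    and now independent of the device generation.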
}
}
-void nbnxn_cuda_set_cacheconfig(const gmx_device_info_t *devinfo)
+void nbnxn_cuda_set_cacheconfig()
{
cudaError_t stat;
{
for (int j = 0; j < evdwCuNR; j++)
{
- if (devinfo->prop.major >= 3)
- {
- /* Default kernel on sm 3.x and later 32/32 kB Shared/L1 */
- cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferEqual);
- cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
- cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferEqual);
- stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
- }
- else
- {
- /* On Fermi prefer L1 gives 2% higher performance */
- /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
- cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
- cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
- cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
- stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
- }
+ /* Default kernel 32/32 kB Shared/L1 */
+ cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferEqual);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
+ cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferEqual);
+ stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
}
}
#ifndef GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H
#define GMX_MDLIB_NBNXN_CUDA_NBNXN_CUDA_H
-#include "nbnxn_cuda_types.h"
-
//! Set up the cache configuration for the non-bonded kernels.
-void nbnxn_cuda_set_cacheconfig(const gmx_device_info_t *devinfo);
+void nbnxn_cuda_set_cacheconfig();
#endif
#include "nbnxn_cuda.h"
#include "nbnxn_cuda_types.h"
-/* This is a heuristically determined parameter for the Fermi, Kepler
+/* This is a heuristically determined parameter for the Kepler
* and Maxwell architectures for the minimum size of ci lists by multiplying
* this constant with the # of multiprocessors on the current device.
* Since the maximum number of blocks per multiprocessor is 16, the ideal
static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
/* Fw. decl, */
-static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam,
- const gmx_device_info_t *dev_info);
+static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam);
/*! \brief Return whether combination rules are used.
*
it just re-uploads the table.
*/
static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
- cu_nbparam_t *nbp,
- const gmx_device_info_t *dev_info)
+ cu_nbparam_t *nbp)
{
if (nbp->coulomb_tab != nullptr)
{
- nbnxn_cuda_free_nbparam_table(nbp, dev_info);
+ nbnxn_cuda_free_nbparam_table(nbp);
}
nbp->coulomb_tab_scale = ic->tabq_scale;
initParamLookupTable(nbp->coulomb_tab, nbp->coulomb_tab_texobj,
- ic->tabq_coul_F, ic->tabq_size, dev_info);
+ ic->tabq_coul_F, ic->tabq_size);
}
-/*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
-    earlier GPUs, single or twin cut-off. */
+/*! Selects the Ewald kernel type, analytical by default, tabulated only when
+    requested through environment variables; single or twin cut-off. */
-static int pick_ewald_kernel_type(bool bTwinCut,
- const gmx_device_info_t *dev_info)
+static int pick_ewald_kernel_type(bool bTwinCut)
{
bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
int kernel_type;
"requested through environment variables.");
}
- /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
- if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
+ /* By default use analytical Ewald. */
+ bUseAnalyticalEwald = true;
+ if (bForceAnalyticalEwald)
{
- bUseAnalyticalEwald = true;
-
if (debug)
{
fprintf(debug, "Using analytical Ewald CUDA kernels\n");
}
}
- else
+ else if (bForceTabulatedEwald)
{
bUseAnalyticalEwald = false;
static void init_nbparam(cu_nbparam_t *nbp,
const interaction_const_t *ic,
const NbnxnListParameters *listParams,
- const nbnxn_atomdata_t *nbat,
- const gmx_device_info_t *dev_info)
+ const nbnxn_atomdata_t *nbat)
{
int ntypes;
else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
{
/* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
- nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
+ nbp->eeltype = pick_ewald_kernel_type(false);
}
else
{
nbp->coulomb_tab = nullptr;
if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
{
- init_ewald_coulomb_force_table(ic, nbp, dev_info);
+ init_ewald_coulomb_force_table(ic, nbp);
}
/* set up LJ parameter lookup table */
if (!useLjCombRule(nbp))
{
initParamLookupTable(nbp->nbfp, nbp->nbfp_texobj,
- nbat->nbfp, 2*ntypes*ntypes, dev_info);
+ nbat->nbfp, 2*ntypes*ntypes);
}
/* set up LJ-PME parameter lookup table */
if (ic->vdwtype == evdwPME)
{
initParamLookupTable(nbp->nbfp_comb, nbp->nbfp_comb_texobj,
- nbat->nbfp_comb, 2*ntypes, dev_info);
+ nbat->nbfp_comb, 2*ntypes);
}
}
set_cutoff_parameters(nbp, ic, listParams);
- nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
- nb->dev_info);
+ nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
- init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
+ init_ewald_coulomb_force_table(ic, nb->nbparam);
}
/*! Initializes the pair list data structure. */
const nbnxn_atomdata_t *nbat)
{
init_atomdata_first(nb->atdat, nbat->ntype);
- init_nbparam(nb->nbparam, ic, listParams, nbat, nb->dev_info);
+ init_nbparam(nb->nbparam, ic, listParams, nbat);
/* clear energy and shift force outputs */
nbnxn_cuda_clear_e_fshift(nb);
/* set the kernel type for the current GPU */
/* pick L1 cache configuration */
- nbnxn_cuda_set_cacheconfig(nb->dev_info);
+ nbnxn_cuda_set_cacheconfig();
nbnxn_cuda_init_const(nb, ic, listParams, nbat);
}
}
-static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam,
- const gmx_device_info_t *dev_info)
+static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam)
{
if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
{
- destroyParamLookupTable(nbparam->coulomb_tab, nbparam->coulomb_tab_texobj,
- dev_info);
+ destroyParamLookupTable(nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
}
}
atdat = nb->atdat;
nbparam = nb->nbparam;
- nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info);
+ nbnxn_cuda_free_nbparam_table(nbparam);
stat = cudaEventDestroy(nb->nonlocal_done);
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
if (!useLjCombRule(nb->nbparam))
{
- destroyParamLookupTable(nbparam->nbfp, nbparam->nbfp_texobj,
- nb->dev_info);
+ destroyParamLookupTable(nbparam->nbfp, nbparam->nbfp_texobj);
}
if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
{
- destroyParamLookupTable(nbparam->nbfp_comb, nbparam->nbfp_comb_texobj,
- nb->dev_info);
+ destroyParamLookupTable(nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
}
stat = cudaFree(atdat->shift_vec);
* code that is in double precision.
*/
-#if GMX_PTX_ARCH < 300 && GMX_PTX_ARCH != 0
-#error "nbnxn_cuda_kernel.cuh included with GMX_PTX_ARCH < 300 or host pass"
-#endif
-
#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
/* Note: convenience macro, needs to be undef-ed at the end of the file. */
#define EL_EWALD_ANY
* NTHREAD_Z controls the number of j-clusters processed concurrently on NTHREAD_Z
* warp-pairs per block.
*
- * - On CC 2.0-3.5, and >=5.0 NTHREAD_Z == 1, translating to 64 th/block with 16
+ * - On CC 3.0-3.5, and >=5.0 NTHREAD_Z == 1, translating to 64 th/block with 16
* blocks/multiproc, is the fastest even though this setup gives low occupancy
* (except on 6.0).
* NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks
#define THREADS_PER_BLOCK (c_clSize*c_clSize*NTHREAD_Z)
#if GMX_PTX_ARCH >= 350
-#if (GMX_PTX_ARCH <= 210) && (NTHREAD_Z > 1)
- #error NTHREAD_Z > 1 will give incorrect results on CC 2.x
-#endif
/**@}*/
__launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
#else
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/*! \internal \file
- * \brief
- * CUDA non-bonded kernel used through preprocessor-based code generation
- * of multiple kernel flavors for CC 2.x, see nbnxn_cuda_kernels.cuh.
- *
- * NOTE: No include fence as it is meant to be included multiple times.
- *
- * \author Szilárd Páll <pall.szilard@gmail.com>
- * \author Berk Hess <hess@kth.se>
- * \ingroup module_mdlib
- */
-
-#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
-#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
-#include "gromacs/math/utilities.h"
-#include "gromacs/pbcutil/ishift.h"
-/* Note that floating-point constants in CUDA code should be suffixed
- * with f (e.g. 0.5f), to stop the compiler producing intermediate
- * code that is in double precision.
- */
-
-#if GMX_PTX_ARCH >= 300
-#error "nbnxn_cuda_kernel_fermi.cuh included with GMX_PTX_ARCH >= 300"
-#endif
-
-#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
-/* Note: convenience macro, needs to be undef-ed at the end of the file. */
-#define EL_EWALD_ANY
-#endif
-
-#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD || (defined EL_CUTOFF && defined CALC_ENERGIES)
-/* Macro to control the calculation of exclusion forces in the kernel
- * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
- * energy terms.
- *
- * Note: convenience macro, needs to be undef-ed at the end of the file.
- */
-#define EXCLUSION_FORCES
-#endif
-
-#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
-/* Note: convenience macro, needs to be undef-ed at the end of the file. */
-#define LJ_EWALD
-#endif
-
-#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
-#define LJ_COMB
-#endif
-
-/*
- Kernel launch parameters:
- - #blocks = #pair lists, blockId = pair list Id
- - #threads = c_clSize^2
- - shmem = see nbnxn_cuda.cu:calc_shmem_required_nonbonded()
-
- Each thread calculates an i force-component taking one pair of i-j atoms.
- */
-
-/**@{*/
-/*! \brief Definition of kernel launch configuration parameters for CC 2.x.
- */
-
-/* Kernel launch bounds, 16 blocks/multiprocessor can be kept in flight. */
-#define THREADS_PER_BLOCK (c_clSize*c_clSize)
-
-__launch_bounds__(THREADS_PER_BLOCK)
-#ifdef PRUNE_NBL
-#ifdef CALC_ENERGIES
-__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_cuda)
-#else
-__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_cuda)
-#endif /* CALC_ENERGIES */
-#else
-#ifdef CALC_ENERGIES
-__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_cuda)
-#else
-__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
-#endif /* CALC_ENERGIES */
-#endif /* PRUNE_NBL */
-(const cu_atomdata_t atdat,
- const cu_nbparam_t nbparam,
- const cu_plist_t plist,
- bool bCalcFshift)
-#ifdef FUNCTION_DECLARATION_ONLY
-; /* Only do function declaration, omit the function body. */
-#else
-{
- /* convenience variables */
- const nbnxn_sci_t *pl_sci = plist.sci;
-#ifndef PRUNE_NBL
- const
-#endif
- nbnxn_cj4_t *pl_cj4 = plist.cj4;
- const nbnxn_excl_t *excl = plist.excl;
-#ifndef LJ_COMB
- const int *atom_types = atdat.atom_types;
- int ntypes = atdat.ntypes;
-#else
- const float2 *lj_comb = atdat.lj_comb;
- float2 ljcp_i, ljcp_j;
-#endif
- const float4 *xq = atdat.xq;
- float3 *f = atdat.f;
- const float3 *shift_vec = atdat.shift_vec;
- float rcoulomb_sq = nbparam.rcoulomb_sq;
-#ifdef VDW_CUTOFF_CHECK
- float rvdw_sq = nbparam.rvdw_sq;
- float vdw_in_range;
-#endif
-#ifdef LJ_EWALD
- float lje_coeff2, lje_coeff6_6;
-#endif
-#ifdef EL_RF
- float two_k_rf = nbparam.two_k_rf;
-#endif
-#ifdef EL_EWALD_ANA
- float beta2 = nbparam.ewald_beta*nbparam.ewald_beta;
- float beta3 = nbparam.ewald_beta*nbparam.ewald_beta*nbparam.ewald_beta;
-#endif
-#ifdef PRUNE_NBL
- float rlist_sq = nbparam.rlistOuter_sq;
-#endif
-
-#ifdef CALC_ENERGIES
-#ifdef EL_EWALD_ANY
- float beta = nbparam.ewald_beta;
- float ewald_shift = nbparam.sh_ewald;
-#else
- float c_rf = nbparam.c_rf;
-#endif /* EL_EWALD_ANY */
- float *e_lj = atdat.e_lj;
- float *e_el = atdat.e_el;
-#endif /* CALC_ENERGIES */
-
- /* thread/block/warp id-s */
- unsigned int tidxi = threadIdx.x;
- unsigned int tidxj = threadIdx.y;
- unsigned int tidx = threadIdx.y * blockDim.x + threadIdx.x;
- unsigned int bidx = blockIdx.x;
- unsigned int widx = tidx / warp_size; /* warp index */
-
- int sci, ci, cj,
- ai, aj,
- cij4_start, cij4_end;
-#ifndef LJ_COMB
- int typei, typej;
-#endif
- int i, jm, j4, wexcl_idx;
- float qi, qj_f,
- r2, inv_r, inv_r2;
-#if !defined LJ_COMB_LB || defined CALC_ENERGIES
- float inv_r6, c6, c12;
-#endif
-#ifdef LJ_COMB_LB
- float sigma, epsilon;
-#endif
- float int_bit,
- F_invr;
-#ifdef CALC_ENERGIES
- float E_lj, E_el;
-#endif
-#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
- float E_lj_p;
-#endif
- unsigned int wexcl, imask, mask_ji;
- float4 xqbuf;
- float3 xi, xj, rv, f_ij, fcj_buf;
- float3 fci_buf[c_numClPerSupercl]; /* i force buffer */
- nbnxn_sci_t nb_sci;
-
- /*! i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
- const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
-
- /*********************************************************************
- * Set up shared memory pointers.
- * sm_nextSlotPtr should always be updated to point to the "next slot",
- * that is past the last point where data has been stored.
- */
- extern __shared__ char sm_dynamicShmem[];
- char *sm_nextSlotPtr = sm_dynamicShmem;
- static_assert(sizeof(char) == 1, "The shared memory offset calculation assumes that char is 1 byte");
-
- /* shmem buffer for i x+q pre-loading */
- float4 *xqib = (float4 *)sm_nextSlotPtr;
- sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*xqib));
-
- /* shmem buffer for cj, for each warp separately */
- int *cjs = (int *)(sm_nextSlotPtr);
- sm_nextSlotPtr += (c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));
-
- /* shmem j force buffer */
- float *f_buf = (float *)(sm_nextSlotPtr);
- sm_nextSlotPtr += (c_clSize * c_clSize * 3*sizeof(*f_buf));
- /*********************************************************************/
-
- nb_sci = pl_sci[bidx]; /* my i super-cluster's index = current bidx */
- sci = nb_sci.sci; /* super-cluster */
- cij4_start = nb_sci.cj4_ind_start; /* first ...*/
- cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */
-
- {
- /* Pre-load i-atom x and q into shared memory */
- ci = sci * c_numClPerSupercl + tidxj;
- ai = ci * c_clSize + tidxi;
-
- xqbuf = xq[ai] + shift_vec[nb_sci.shift];
- xqbuf.w *= nbparam.epsfac;
- xqib[tidxj * c_clSize + tidxi] = xqbuf;
- }
- __syncthreads();
-
- for (i = 0; i < c_numClPerSupercl; i++)
- {
- fci_buf[i] = make_float3(0.0f);
- }
-
-#ifdef LJ_EWALD
- /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */
- lje_coeff2 = nbparam.ewaldcoeff_lj*nbparam.ewaldcoeff_lj;
- lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*c_oneSixth;
-#endif
-
-
-#ifdef CALC_ENERGIES
- E_lj = 0.0f;
- E_el = 0.0f;
-
-#ifdef EXCLUSION_FORCES /* Ewald or RF */
- if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*c_numClPerSupercl)
- {
- /* we have the diagonal: add the charge and LJ self interaction energy term */
- for (i = 0; i < c_numClPerSupercl; i++)
- {
-#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
- qi = xqib[i * c_clSize + tidxi].w;
- E_el += qi*qi;
-#endif
-
-#ifdef LJ_EWALD
- E_lj += LDG(&nbparam.nbfp[atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2]);
-#endif
- }
-
- /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
-#ifdef LJ_EWALD
- E_lj /= c_clSize;
- E_lj *= 0.5f*c_oneSixth*lje_coeff6_6;
-#endif
-
-#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
- /* Correct for epsfac^2 due to adding qi^2 */
- E_el /= nbparam.epsfac*c_clSize;
-#if defined EL_RF || defined EL_CUTOFF
- E_el *= -0.5f*c_rf;
-#else
- E_el *= -beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
-#endif
-#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
- }
-#endif /* EXCLUSION_FORCES */
-
-#endif /* CALC_ENERGIES */
-
-#ifdef EXCLUSION_FORCES
- const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
-#endif
-
- /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
- for (j4 = cij4_start; j4 < cij4_end; j4++)
- {
- wexcl_idx = pl_cj4[j4].imei[widx].excl_ind;
- imask = pl_cj4[j4].imei[widx].imask;
- wexcl = excl[wexcl_idx].pair[(tidx) & (warp_size - 1)];
-
-#ifndef PRUNE_NBL
- if (imask)
-#endif
- {
- /* Pre-load cj into shared memory on both warps separately */
- if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
- {
- cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
- }
-
- /* Unrolling this loop with pruning leads to register spilling;
- Tested with up to nvcc 7.5 */
-#if !defined PRUNE_NBL
-#pragma unroll 4
-#endif
- for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
- {
- if (imask & (superClInteractionMask << (jm * c_numClPerSupercl)))
- {
- mask_ji = (1U << (jm * c_numClPerSupercl));
-
- cj = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize];
- aj = cj * c_clSize + tidxj;
-
- /* load j atom data */
- xqbuf = xq[aj];
- xj = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
- qj_f = xqbuf.w;
-#ifndef LJ_COMB
- typej = atom_types[aj];
-#else
- ljcp_j = lj_comb[aj];
-#endif
-
- fcj_buf = make_float3(0.0f);
-
-#if !defined PRUNE_NBL
-#pragma unroll 8
-#endif
- for (i = 0; i < c_numClPerSupercl; i++)
- {
- if (imask & mask_ji)
- {
- ci = sci * c_numClPerSupercl + i; /* i cluster index */
- ai = ci * c_clSize + tidxi; /* i atom index */
-
- /* all threads load an atom from i cluster ci into shmem! */
- xqbuf = xqib[i * c_clSize + tidxi];
- xi = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
-
- /* distance between i and j atoms */
- rv = xi - xj;
- r2 = norm2(rv);
-
-#ifdef PRUNE_NBL
- /* If _none_ of the atoms pairs are in cutoff range,
- the bit corresponding to the current
- cluster-pair in imask gets set to 0. */
- if (!__any(r2 < rlist_sq))
- {
- imask &= ~mask_ji;
- }
-#endif
-
- int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
-
- /* cutoff & exclusion check */
-#ifdef EXCLUSION_FORCES
- if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
-#else
- if ((r2 < rcoulomb_sq) * int_bit)
-#endif
- {
- /* load the rest of the i-atom parameters */
- qi = xqbuf.w;
-
-#ifndef LJ_COMB
- /* LJ 6*C6 and 12*C12 */
- typei = atom_types[ai];
- fetch_nbfp_c6_c12(c6, c12, nbparam, ntypes * typei + typej);
-#else
- ljcp_i = lj_comb[ai];
-#ifdef LJ_COMB_GEOM
- c6 = ljcp_i.x * ljcp_j.x;
- c12 = ljcp_i.y * ljcp_j.y;
-#else
- /* LJ 2^(1/6)*sigma and 12*epsilon */
- sigma = ljcp_i.x + ljcp_j.x;
- epsilon = ljcp_i.y * ljcp_j.y;
-#if defined CALC_ENERGIES || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
- convert_sigma_epsilon_to_c6_c12(sigma, epsilon, &c6, &c12);
-#endif
-#endif /* LJ_COMB_GEOM */
-#endif /* LJ_COMB */
-
- // Ensure distance do not become so small that r^-12 overflows
- r2 = max(r2, NBNXN_MIN_RSQ);
-
- inv_r = rsqrt(r2);
- inv_r2 = inv_r * inv_r;
-#if !defined LJ_COMB_LB || defined CALC_ENERGIES
- inv_r6 = inv_r2 * inv_r2 * inv_r2;
-#ifdef EXCLUSION_FORCES
- /* We could mask inv_r2, but with Ewald
- * masking both inv_r6 and F_invr is faster */
- inv_r6 *= int_bit;
-#endif /* EXCLUSION_FORCES */
-
- F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
-#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
- E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam.repulsion_shift.cpot)*c_oneTwelveth -
- c6 * (inv_r6 + nbparam.dispersion_shift.cpot)*c_oneSixth);
-#endif
-#else /* !LJ_COMB_LB || CALC_ENERGIES */
- float sig_r = sigma*inv_r;
- float sig_r2 = sig_r*sig_r;
- float sig_r6 = sig_r2*sig_r2*sig_r2;
-#ifdef EXCLUSION_FORCES
- sig_r6 *= int_bit;
-#endif /* EXCLUSION_FORCES */
-
- F_invr = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
-#endif /* !LJ_COMB_LB || CALC_ENERGIES */
-
-#ifdef LJ_FORCE_SWITCH
-#ifdef CALC_ENERGIES
- calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
-#else
- calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
-#endif /* CALC_ENERGIES */
-#endif /* LJ_FORCE_SWITCH */
-
-
-#ifdef LJ_EWALD
-#ifdef LJ_EWALD_COMB_GEOM
-#ifdef CALC_ENERGIES
- calculate_lj_ewald_comb_geom_F_E(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p);
-#else
- calculate_lj_ewald_comb_geom_F(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr);
-#endif /* CALC_ENERGIES */
-#elif defined LJ_EWALD_COMB_LB
- calculate_lj_ewald_comb_LB_F_E(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6,
-#ifdef CALC_ENERGIES
- int_bit, &F_invr, &E_lj_p
-#else
- 0, &F_invr, NULL
-#endif /* CALC_ENERGIES */
- );
-#endif /* LJ_EWALD_COMB_GEOM */
-#endif /* LJ_EWALD */
-
-#ifdef LJ_POT_SWITCH
-#ifdef CALC_ENERGIES
- calculate_potential_switch_F_E(nbparam, inv_r, r2, &F_invr, &E_lj_p);
-#else
- calculate_potential_switch_F(nbparam, inv_r, r2, &F_invr, &E_lj_p);
-#endif /* CALC_ENERGIES */
-#endif /* LJ_POT_SWITCH */
-
-#ifdef VDW_CUTOFF_CHECK
- /* Separate VDW cut-off check to enable twin-range cut-offs
- * (rvdw < rcoulomb <= rlist)
- */
- vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
- F_invr *= vdw_in_range;
-#ifdef CALC_ENERGIES
- E_lj_p *= vdw_in_range;
-#endif
-#endif /* VDW_CUTOFF_CHECK */
-
-#ifdef CALC_ENERGIES
- E_lj += E_lj_p;
-#endif
-
-
-#ifdef EL_CUTOFF
-#ifdef EXCLUSION_FORCES
- F_invr += qi * qj_f * int_bit * inv_r2 * inv_r;
-#else
- F_invr += qi * qj_f * inv_r2 * inv_r;
-#endif
-#endif
-#ifdef EL_RF
- F_invr += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
-#endif
-#if defined EL_EWALD_ANA
- F_invr += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
-#elif defined EL_EWALD_TAB
- F_invr += qi * qj_f * (int_bit*inv_r2 -
- interpolate_coulomb_force_r(nbparam, r2 * inv_r)) * inv_r;
-#endif /* EL_EWALD_ANA/TAB */
-
-#ifdef CALC_ENERGIES
-#ifdef EL_CUTOFF
- E_el += qi * qj_f * (int_bit*inv_r - c_rf);
-#endif
-#ifdef EL_RF
- E_el += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
-#endif
-#ifdef EL_EWALD_ANY
- /* 1.0f - erff is faster than erfcf */
- E_el += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
-#endif /* EL_EWALD_ANY */
-#endif
- f_ij = rv * F_invr;
-
- /* accumulate j forces in registers */
- fcj_buf -= f_ij;
-
- /* accumulate i forces in registers */
- fci_buf[i] += f_ij;
- }
- }
-
- /* shift the mask bit by 1 */
- mask_ji += mask_ji;
- }
-
- /* reduce j forces */
- /* store j forces in shmem */
- f_buf[ tidx] = fcj_buf.x;
- f_buf[ c_fbufStride + tidx] = fcj_buf.y;
- f_buf[2 * c_fbufStride + tidx] = fcj_buf.z;
-
- reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
- }
- }
-#ifdef PRUNE_NBL
- /* Update the imask with the new one which does not contain the
- out of range clusters anymore. */
- pl_cj4[j4].imei[widx].imask = imask;
-#endif
- }
- }
-
- /* skip central shifts when summing shift forces */
- if (nb_sci.shift == CENTRAL)
- {
- bCalcFshift = false;
- }
-
- float fshift_buf = 0.0f;
-
- /* reduce i forces */
- for (i = 0; i < c_numClPerSupercl; i++)
- {
- ai = (sci * c_numClPerSupercl + i) * c_clSize + tidxi;
- f_buf[ tidx] = fci_buf[i].x;
- f_buf[ c_fbufStride + tidx] = fci_buf[i].y;
- f_buf[2 * c_fbufStride + tidx] = fci_buf[i].z;
- __syncthreads();
- reduce_force_i(f_buf, f,
- &fshift_buf, bCalcFshift,
- tidxi, tidxj, ai);
- __syncthreads();
- }
-
- /* add up local shift forces into global mem, tidxj indexes x,y,z */
- if (bCalcFshift && tidxj < 3)
- {
- atomicAdd(&(atdat.fshift[nb_sci.shift].x) + tidxj, fshift_buf);
- }
-
-#ifdef CALC_ENERGIES
- /* flush the energies to shmem and reduce them */
- f_buf[ tidx] = E_lj;
- f_buf[c_fbufStride + tidx] = E_el;
- reduce_energy_pow2(f_buf + (tidx & warp_size), e_lj, e_el, tidx & ~warp_size);
-#endif
-}
-#endif /* FUNCTION_DECLARATION_ONLY */
-
-#undef THREADS_PER_BLOCK
-
-#undef EL_EWALD_ANY
-#undef EXCLUSION_FORCES
-#undef LJ_EWALD
-
-#undef LJ_COMB
}
/*! Final j-force reduction; this implementation works only with power of two
- * array sizes and with sm >= 3.0
+ * array sizes.
*/
-#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
static __forceinline__ __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
int tidxi, int aidx,
atomicAdd((&fout[aidx].x) + tidxi, f.x);
}
}
-#endif
/*! Final i-force reduction; this generic implementation works with
* arbitrary array sizes.
}
/*! Final i-force reduction; this implementation works only with power of two
- * array sizes and with sm >= 3.0
+ * array sizes.
*/
-#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
static __forceinline__ __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
float *fshift_buf, bool bCalcFshift,
}
}
}
-#endif
/*! Energy reduction; this implementation works only with power of two
* array sizes.
}
/*! Energy reduction; this implementation works only with power of two
- * array sizes and with sm >= 3.0
+ * array sizes.
*/
-#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
static __forceinline__ __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
float *e_lj, float *e_el,
atomicAdd(e_el, E_el);
}
}
-#endif /* GMX_PTX_ARCH */
#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */
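The power-of-two restriction noted in these comments is what lets each reduction halve the active width every step: with warp_size == 32 the shuffle offsets are 16, 8, 4, 2, 1, so a full warp is summed in five steps with no leftover lanes to special-case.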
* \ingroup module_mdlib
*/
-/* Use the standard (non-Fermi) kernel in host pass too. */
-#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
-#define FLAVOR_LEVEL_GENERATOR "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
-#else
-#define FLAVOR_LEVEL_GENERATOR "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh"
-#endif
-
/* Analytical plain cut-off electrostatics kernels
*/
#define EL_CUTOFF
/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w geometric combination rules */
#define LJ_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w LB combination rules */
#define LJ_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w geometric combination rules */
#define LJ_EWALD_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w LB combination rules */
#define LJ_EWALD_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_POT_SWITCH
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w geometric combination rules */
#define LJ_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w LB combination rules */
#define LJ_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w geometric combination rules */
#define LJ_EWALD_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w LB combination rules */
#define LJ_EWALD_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_POT_SWITCH
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w geometric combination rules */
#define LJ_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w LB combination rules */
#define LJ_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w geometric combination rules */
#define LJ_EWALD_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w LB combination rules */
#define LJ_EWALD_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_POT_SWITCH
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w geometric combination rules */
#define LJ_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w LB combination rules */
#define LJ_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w geometric combination rules */
#define LJ_EWALD_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w LB combination rules */
#define LJ_EWALD_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_POT_SWITCH
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w geometric combination rules */
#define LJ_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w LB combination rules */
#define LJ_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w geometric combination rules */
#define LJ_EWALD_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w LB combination rules */
#define LJ_EWALD_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_POT_SWITCH
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w geometric combination rules */
#define LJ_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* cut-off + V shift LJ w LB combination rules */
#define LJ_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w geometric combination rules */
#define LJ_EWALD_COMB_GEOM
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_GEOM
#undef NB_KERNEL_FUNC_NAME
/* LJ-Ewald w LB combination rules */
#define LJ_EWALD_COMB_LB
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombLB ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_EWALD_COMB_LB
#undef NB_KERNEL_FUNC_NAME
/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJFsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJPsw ## __VA_ARGS__
-#include FLAVOR_LEVEL_GENERATOR
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
#undef LJ_POT_SWITCH
#undef NB_KERNEL_FUNC_NAME
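For readers new to this file, a hedged, self-contained illustration of the token-pasting scheme driving all of the blocks above (names are hypothetical; the real code re-includes the kernel header once per flavor in exactly this way):

    #include <stdio.h>

    /* Each flavor pastes a distinct suffix onto the generated function name,
     * as NB_KERNEL_FUNC_NAME does for the kernels above. */
    #define NB_FUNC_NAME(x, ...) x ## _ElecCut ## __VA_ARGS__
    int NB_FUNC_NAME(kernel, _F)(int v) { return 2 * v; } /* -> kernel_ElecCut_F */
    #undef NB_FUNC_NAME

    #define NB_FUNC_NAME(x, ...) x ## _ElecRF ## __VA_ARGS__
    int NB_FUNC_NAME(kernel, _F)(int v) { return 3 * v; } /* -> kernel_ElecRF_F */
    #undef NB_FUNC_NAME

    int main(void)
    {
        printf("%d %d\n", kernel_ElecCut_F(1), kernel_ElecRF_F(1)); /* prints: 2 3 */
        return 0;
    }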
* TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
*
*/
- //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
- if (!bForceTabulatedEwald)
+ /* By default use analytical Ewald. */
+ bUseAnalyticalEwald = true;
+ if (bForceAnalyticalEwald)
{
- bUseAnalyticalEwald = true;
-
if (debug)
{
fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
}
}
- else
+ else if (bForceTabulatedEwald)
{
bUseAnalyticalEwald = false;