This simplifies cmake-time requirements as we can now require the same
CUDA version for all cases except MSVC. It also allows us to
unconditionally compile with the cudaLaunchKernel API and remove the
alternative chevron-notation at least in the nonbonded module.
Change-Id: I199f17151cb227fda9d76c1bf0d7fd73f1d31275
option(GMX_USE_OPENCL "Enable OpenCL acceleration" OFF)
# Decide on GPU settings based on user-settings and GPU/CUDA
option(GMX_USE_OPENCL "Enable OpenCL acceleration" OFF)
# Decide on GPU settings based on user-settings and GPU/CUDA
-# detection. GCC 4.8 requires CUDA 6.0 (but we choose 6.5 for the
-# preliminary C++11 support), icc 15 requires CUDA 7.0, and VS2015
-# requires CUDA 8.0
+# detection. VS2015 requires CUDA 8.0; for all other architectures and
+# compilers we require CUDA 7.0 or later (including for clang-CUDA).
if(MSVC)
set(REQUIRED_CUDA_VERSION 8.0)
if(MSVC)
set(REQUIRED_CUDA_VERSION 8.0)
-elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
- set(REQUIRED_CUDA_VERSION 7.0)
- set(REQUIRED_CUDA_VERSION 6.5)
+ set(REQUIRED_CUDA_VERSION 7.0)
endif()
set(REQUIRED_CUDA_COMPUTE_CAPABILITY 2.0)
endif()
set(REQUIRED_CUDA_COMPUTE_CAPABILITY 2.0)
# Test MPI with CUDA
# Test MPMD PME with library MPI
# Test recent cmake (3.7+), to cover minor FindCUDA changes from 3.7.0
# Test MPI with CUDA
# Test MPMD PME with library MPI
# Test recent cmake (3.7+), to cover minor FindCUDA changes from 3.7.0
-gcc-4.8 gpu cuda-6.5 cmake-3.8.1 mpi npme=1 nranks=2 openmp
+gcc-4.8 gpu cuda-7.0 cmake-3.8.1 mpi npme=1 nranks=2 openmp
# Test non-default use of mdrun -gpu_id
# Test newest gcc supported by newest CUDA at time of release
# Test non-default use of mdrun -gpu_id
# Test newest gcc supported by newest CUDA at time of release
option(GMX_GPU "Enable GPU acceleration" OFF)
option(GMX_CLANG_CUDA "Use clang for CUDA" OFF)
option(GMX_GPU "Enable GPU acceleration" OFF)
option(GMX_CLANG_CUDA "Use clang for CUDA" OFF)
-if (GMX_CLANG_CUDA)
- # CUDA 7.0 or later required, override req. version
- set(REQUIRED_CUDA_VERSION 7.0)
-endif()
if(GMX_GPU AND GMX_DOUBLE)
message(FATAL_ERROR "GPU acceleration is not available in double precision!")
if(GMX_GPU AND GMX_DOUBLE)
message(FATAL_ERROR "GPU acceleration is not available in double precision!")
|Gromacs| has excellent support for NVIDIA GPUs supported via CUDA.
On Linux, NVIDIA CUDA_ toolkit with minimum version |REQUIRED_CUDA_VERSION|
|Gromacs| has excellent support for NVIDIA GPUs supported via CUDA.
On Linux, NVIDIA CUDA_ toolkit with minimum version |REQUIRED_CUDA_VERSION|
-is required, and the latest
-version is strongly encouraged. Using Intel or Microsoft MSVC compilers
-requires version 7.0 and 8.0, respectively. NVIDIA GPUs with at
+is required, and the latest version is strongly encouraged. Using
+the Microsoft MSVC compiler requires version 8.0. NVIDIA GPUs with at
least NVIDIA compute capability |REQUIRED_CUDA_COMPUTE_CAPABILITY| are
required. You are strongly recommended to
get the latest CUDA version and driver that supports your hardware, but
least NVIDIA compute capability |REQUIRED_CUDA_COMPUTE_CAPABILITY| are
required. You are strongly recommended to
get the latest CUDA version and driver that supports your hardware, but
``GMX_CUDA_NB_TAB_EWALD``
force the use of tabulated Ewald kernels. Should be used only for benchmarking.
``GMX_CUDA_NB_TAB_EWALD``
force the use of tabulated Ewald kernels. Should be used only for benchmarking.
-``GMX_DISABLE_CUDALAUNCH``
- disable the use of the lower-latency cudaLaunchKernel API even when supported (CUDA >=v7.0).
- Should only be used for benchmarking purposes.
-
``GMX_DISABLE_CUDA_TIMING``
Deprecated. Use ``GMX_DISABLE_GPU_TIMING`` instead.
``GMX_DISABLE_CUDA_TIMING``
Deprecated. Use ``GMX_DISABLE_GPU_TIMING`` instead.
/*
* This file is part of the GROMACS molecular simulation package.
*
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/utility/exceptions.h"
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/utility/exceptions.h"
//! Solving kernel max block size in threads
constexpr int c_solveMaxThreadsPerBlock = (c_solveMaxWarpsPerBlock * warp_size);
//! Solving kernel max block size in threads
constexpr int c_solveMaxThreadsPerBlock = (c_solveMaxWarpsPerBlock * warp_size);
-// CUDA 6.5 can not compile enum class as a template kernel parameter,
-// so we replace it with a duplicate simple enum
-#if GMX_CUDA_VERSION >= 7000
-using GridOrderingInternal = GridOrdering;
-#else
-enum GridOrderingInternal
-{
- YZX,
- XYZ
-};
-#endif
-
/*! \brief
* PME complex grid solver kernel function.
*
/*! \brief
* PME complex grid solver kernel function.
*
* \param[in] kernelParams Input PME CUDA data in constant memory.
*/
template<
* \param[in] kernelParams Input PME CUDA data in constant memory.
*/
template<
- GridOrderingInternal gridOrdering,
+ GridOrdering gridOrdering,
bool computeEnergyAndVirial
>
__launch_bounds__(c_solveMaxThreadsPerBlock)
bool computeEnergyAndVirial
>
__launch_bounds__(c_solveMaxThreadsPerBlock)
int majorDim, middleDim, minorDim;
switch (gridOrdering)
{
int majorDim, middleDim, minorDim;
switch (gridOrdering)
{
- case GridOrderingInternal::YZX:
+ case GridOrdering::YZX:
majorDim = YY;
middleDim = ZZ;
minorDim = XX;
break;
majorDim = YY;
middleDim = ZZ;
minorDim = XX;
break;
- case GridOrderingInternal::XYZ:
+ case GridOrdering::XYZ:
majorDim = XX;
middleDim = YY;
minorDim = ZZ;
majorDim = XX;
middleDim = YY;
minorDim = ZZ;
const int kMiddle = indexMiddle + localOffsetMiddle;
float mMiddle = kMiddle;
/* Checking Y in XYZ case */
const int kMiddle = indexMiddle + localOffsetMiddle;
float mMiddle = kMiddle;
/* Checking Y in XYZ case */
- if (gridOrdering == GridOrderingInternal::XYZ)
+ if (gridOrdering == GridOrdering::XYZ)
{
mMiddle = (kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle);
}
const int kMinor = localOffsetMinor + indexMinor;
float mMinor = kMinor;
/* Checking X in YZX case */
{
mMiddle = (kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle);
}
const int kMinor = localOffsetMinor + indexMinor;
float mMinor = kMinor;
/* Checking X in YZX case */
- if (gridOrdering == GridOrderingInternal::YZX)
+ if (gridOrdering == GridOrdering::YZX)
{
mMinor = (kMinor < maxkMinor) ? kMinor : (kMinor - nMinor);
}
{
mMinor = (kMinor < maxkMinor) ? kMinor : (kMinor - nMinor);
}
float mX, mY, mZ;
switch (gridOrdering)
{
float mX, mY, mZ;
switch (gridOrdering)
{
- case GridOrderingInternal::YZX:
+ case GridOrdering::YZX:
mX = mMinor;
mY = mMajor;
mZ = mMiddle;
break;
mX = mMinor;
mY = mMajor;
mZ = mMiddle;
break;
- case GridOrderingInternal::XYZ:
+ case GridOrdering::XYZ:
mX = mMajor;
mY = mMiddle;
mZ = mMinor;
mX = mMajor;
mY = mMiddle;
mZ = mMinor;
float corner_fac = 1.0f;
switch (gridOrdering)
{
float corner_fac = 1.0f;
switch (gridOrdering)
{
- case GridOrderingInternal::YZX:
+ case GridOrdering::YZX:
if ((kMiddle == 0) | (kMiddle == maxkMiddle))
{
corner_fac = 0.5f;
}
break;
if ((kMiddle == 0) | (kMiddle == maxkMiddle))
{
corner_fac = 0.5f;
}
break;
- case GridOrderingInternal::XYZ:
+ case GridOrdering::XYZ:
if ((kMinor == 0) | (kMinor == maxkMinor))
{
corner_fac = 0.5f;
if ((kMinor == 0) | (kMinor == maxkMinor))
{
corner_fac = 0.5f;
{
if (computeEnergyAndVirial)
{
{
if (computeEnergyAndVirial)
{
- pme_solve_kernel<GridOrderingInternal::YZX, true> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
+ pme_solve_kernel<GridOrdering::YZX, true> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
- pme_solve_kernel<GridOrderingInternal::YZX, false> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
+ pme_solve_kernel<GridOrdering::YZX, false> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
}
}
else if (gridOrdering == GridOrdering::XYZ)
{
if (computeEnergyAndVirial)
{
}
}
else if (gridOrdering == GridOrdering::XYZ)
{
if (computeEnergyAndVirial)
{
- pme_solve_kernel<GridOrderingInternal::XYZ, true> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
+ pme_solve_kernel<GridOrdering::XYZ, true> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
- pme_solve_kernel<GridOrderingInternal::XYZ, false> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
+ pme_solve_kernel<GridOrdering::XYZ, false> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
}
}
CU_LAUNCH_ERR("pme_solve_kernel");
}
}
CU_LAUNCH_ERR("pme_solve_kernel");
/*********************************/
/*********************************/
-/* XXX switch between chevron and cudaLaunch (supported only in CUDA >=7.0)
- -- only for benchmarking purposes */
-static const bool bUseCudaLaunchKernel =
- (GMX_CUDA_VERSION >= 7000) && (getenv("GMX_DISABLE_CUDALAUNCH") == NULL);
-
/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
static inline int calc_nb_kernel_nblock(int nwork_units, const gmx_device_info_t *dinfo)
{
/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
static inline int calc_nb_kernel_nblock(int nwork_units, const gmx_device_info_t *dinfo)
{
- if (bUseCudaLaunchKernel)
- {
- gmx_unused void* kernel_args[4];
- kernel_args[0] = adat;
- kernel_args[1] = nbp;
- kernel_args[2] = plist;
- kernel_args[3] = &bCalcFshift;
+ void* kernel_args[4];
+ kernel_args[0] = adat;
+ kernel_args[1] = nbp;
+ kernel_args[2] = plist;
+ kernel_args[3] = &bCalcFshift;
-#if GMX_CUDA_VERSION >= 7000
- cudaLaunchKernel((void *)nb_kernel, dim_grid, dim_block, kernel_args, shmem, stream);
-#endif
- }
- else
- {
- nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
- }
+ cudaLaunchKernel((void *)nb_kernel, dim_grid, dim_block, kernel_args, shmem, stream);
CU_LAUNCH_ERR("k_calc_nb");
if (bDoTime)
CU_LAUNCH_ERR("k_calc_nb");
if (bDoTime)
- if (bUseCudaLaunchKernel)
- {
- gmx_unused void* kernel_args[5];
- kernel_args[0] = adat;
- kernel_args[1] = nbp;
- kernel_args[2] = plist;
- kernel_args[3] = &numParts;
- kernel_args[4] = ∂
+ void* kernel_args[5];
+ kernel_args[0] = adat;
+ kernel_args[1] = nbp;
+ kernel_args[2] = plist;
+ kernel_args[3] = &numParts;
+ kernel_args[4] = ∂
-#if GMX_CUDA_VERSION >= 7000
- if (plist->haveFreshList)
- {
- cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<true>, dim_grid, dim_block, kernel_args, shmem, stream);
- }
- else
- {
- cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<false>, dim_grid, dim_block, kernel_args, shmem, stream);
- }
-#endif
+ if (plist->haveFreshList)
+ {
+ cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<true>, dim_grid, dim_block, kernel_args, shmem, stream);
- if (plist->haveFreshList)
- {
- nbnxn_kernel_prune_cuda<true><<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, numParts, part);
- }
- else
- {
- nbnxn_kernel_prune_cuda<false><<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, numParts, part);
- }
+ cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<false>, dim_grid, dim_block, kernel_args, shmem, stream);
}
CU_LAUNCH_ERR("k_pruneonly");
}
CU_LAUNCH_ERR("k_pruneonly");