* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+/*! \file
+ * \brief Define CUDA implementation of nbnxn_gpu.h
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "config.h"
-#include <stdlib.h>
#include <assert.h>
+#include <stdlib.h>
+
+#include "gromacs/mdlib/nbnxn_gpu.h"
#if defined(_MSVC)
#include <limits>
#endif

#include <cuda.h>
-#include "types/simple.h"
-#include "types/nbnxn_pairlist.h"
-#include "types/nb_verlet.h"
-#include "types/ishift.h"
-#include "types/force_flags.h"
-#include "../nbnxn_consts.h"
-
#ifdef TMPI_ATOMICS
#include "thread_mpi/atomic.h"
#endif
-#include "nbnxn_cuda_types.h"
-#include "../../gmxlib/cuda_tools/cudautils.cuh"
-#include "nbnxn_cuda.h"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/cstringutil.h"
-#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
-#define USE_TEXOBJ
-#endif
+#include "nbnxn_cuda_types.h"
/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
texture<float, 1, cudaReadModeElementType> nbfp_texref;
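+/* Illustrative only: the kernels read these parameters with tex1Dfetch(); a
+ * minimal sketch, assuming the usual interleaved C6/C12 layout of nbfp (the
+ * exact pair-index arithmetic lives in nbnxn_cuda_kernel_utils.cuh):
+ *
+ *   int   idx = 2*(ntypes*typei + typej);      // C6/C12 pair index (sketch)
+ *   float c6  = tex1Dfetch(nbfp_texref, idx);
+ *   float c12 = tex1Dfetch(nbfp_texref, idx + 1);
+ */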
#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
+/* NTHREAD_Z controls the number of j-clusters processed concurrently per
+ * block: each of the NTHREAD_Z warp-pairs in a block works on its own j-cluster.
+ *
+ * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 th/block with 16
+ * blocks/multiproc, is the fastest even though this setup gives low occupancy.
+ * NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks
+ * per multiprocessor is reduced proportionally to get the original number of max
+ * threads in flight (and slightly lower performance).
+ * - On CC 3.7 there are enough registers to double the number of threads; using
+ *   NTHREAD_Z == 2 is fastest with 16 blocks (TODO: test with RF and other kernels
+ * with low-register use).
+ *
+ * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
+ * shuffle-based reduction, hence CC >= 3.0.
+ */
+
+/* Kernel launch bounds as a function of NTHREAD_Z.
+ * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
+ * - CC 3.7: NTHREAD_Z=2, (128, 16) bounds
+ */
+#if __CUDA_ARCH__ == 370
+#define NTHREAD_Z (2)
+#define MIN_BLOCKS_PER_MP (16)
+#else
+#define NTHREAD_Z (1)
+#define MIN_BLOCKS_PER_MP (16)
+#endif
+#define THREADS_PER_BLOCK (CL_SIZE*CL_SIZE*NTHREAD_Z)
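+/* A minimal sketch of how the two macros above are consumed; the kernel name
+ * and exact signature are hypothetical, the real flavors are generated by the
+ * includes below:
+ *
+ *   __global__ void
+ *   __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
+ *   nbnxn_kernel_flavor(const cu_atomdata_t atdat, const cu_nbparam_t nbparam,
+ *                       const cu_plist_t plist, bool bCalcFshift)
+ *   { ... }
+ *
+ * The bounds let the compiler cap register use so that MIN_BLOCKS_PER_MP
+ * blocks of THREADS_PER_BLOCK threads can stay resident per multiprocessor.
+ */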
+
+
/***** The kernels come here *****/
-#include "nbnxn_cuda_kernel_utils.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
/* Top-level kernel generation: will generate through multiple inclusion the
 * following flavors for all kernels:
 * - force-only output;
 * - force and energy output;
 * - force-only with pair list pruning;
 * - force and energy output with pair list pruning.
 */
/** Force only **/
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
/*** Pair-list pruning kernels ***/
/** Force only **/
#define PRUNE_NBL
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
#undef PRUNE_NBL
+
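+/* A simplified sketch of how the flavor macros select code inside the
+ * multiply-included nbnxn_cuda_kernels.cuh (illustrative, not the real body):
+ *
+ *   #ifdef PRUNE_NBL
+ *       // re-check pair distances; drop entries beyond the outer cut-off
+ *   #endif
+ *   #ifdef CALC_ENERGIES
+ *       // accumulate E_lj/E_el alongside the forces
+ *   #endif
+ *
+ * Each inclusion with a different macro combination compiles a separate
+ * kernel flavor from the same source.
+ */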
/*! Nonbonded kernel function pointer type */
typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
const cu_nbparam_t,
static unsigned int poll_wait_pattern = (0x7FU << 23);
/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
-static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
+static inline int calc_nb_kernel_nblock(int nwork_units, gmx_device_info_t *dinfo)
{
int max_grid_x_size;
}
/*! Calculates the amount of shared memory required by the CUDA kernel in use. */
-static inline int calc_shmem_required()
+static inline int calc_shmem_required(const int num_threads_z, gmx_device_info_t gmx_unused *dinfo)
{
int shmem;
+ assert(dinfo);
+
/* size of shmem (force-buffers/xq/atom type preloading) */
/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
- /* cj in shared memory, for both warps separately */
- shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
-#ifdef IATYPE_SHMEM
- /* i-atom types in shared memory */
- shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
-#endif
-#if __CUDA_ARCH__ < 300
- /* force reduction buffers in shared memory */
- shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+ /* cj in shared memory, for each warp separately */
+ shmem += num_threads_z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
+ /* CUDA versions below 4.2 won't generate code for sm>=3.0 */
+#if GMX_CUDA_VERSION >= 4200
+ if (dinfo->prop.major >= 3)
+ {
+ /* i-atom types in shared memory */
+ shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
+ }
+ if (dinfo->prop.major < 3)
#endif
-
+ {
+ /* force reduction buffers in shared memory */
+ shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
+ }
return shmem;
}
the local x+q H2D (and all preceding) tasks are complete and synchronize
with this event in the non-local stream before launching the non-bonded kernel.
*/
-void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags,
- int iloc)
+void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags,
+ int iloc)
{
cudaError_t stat;
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
dim3 dim_block, dim_grid;
nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_nbparam_t *nbp = cu_nb->nbparam;
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *t = cu_nb->timers;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = nb->atdat;
+ cu_nbparam_t *nbp = nb->nbparam;
+ cu_plist_t *plist = nb->plist[iloc];
+ cu_timers_t *t = nb->timers;
+ cudaStream_t stream = nb->stream[iloc];
bool bCalcEner = flags & GMX_FORCE_ENERGY;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
- bool bDoTime = cu_nb->bDoTime;
+ bool bDoTime = nb->bDoTime;
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
/* When we get here all misc operations issued in the local stream as well as
   the local xq H2D are done, so we record that in the local stream and wait
   for it in the nonlocal one. */
- if (cu_nb->bUseTwoStreams)
+ if (nb->bUseTwoStreams)
{
if (iloc == eintLocal)
{
- stat = cudaEventRecord(cu_nb->misc_ops_and_local_H2D_done, stream);
+ stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
}
else
{
- stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_and_local_H2D_done, 0);
+ stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
}
}
bCalcEner,
plist->bDoPrune || always_prune);
- /* kernel launch config */
- nblock = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
- dim_block = dim3(CL_SIZE, CL_SIZE, 1);
+ /* Kernel launch config:
+ * - The thread block dimensions match the size of i-clusters, j-clusters,
+ * and j-cluster concurrency, in x, y, and z, respectively.
+ * - The 1D block-grid contains as many blocks as super-clusters.
+ */
+ int num_threads_z = 1;
+ if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7)
+ {
+ num_threads_z = 2;
+ }
+ nblock = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
+ dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
dim_grid = dim3(nblock, 1, 1);
- shmem = calc_shmem_required();
+ shmem = calc_shmem_required(num_threads_z, nb->dev_info);
if (debug)
{
fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
- "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+ "\tGrid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+ "\tShMem: %d\n",
dim_block.x, dim_block.y, dim_block.z,
dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
- NCL_PER_SUPERCL, plist->na_c);
+ NCL_PER_SUPERCL, plist->na_c,
+ shmem);
}
nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
}
}
-void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags,
- int aloc)
+void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags,
+ int aloc)
{
cudaError_t stat;
int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
gmx_incons(stmp);
}
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_timers_t *t = cu_nb->timers;
- bool bDoTime = cu_nb->bDoTime;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = nb->atdat;
+ cu_timers_t *t = nb->timers;
+ bool bDoTime = nb->bDoTime;
+ cudaStream_t stream = nb->stream[iloc];
bool bCalcEner = flags & GMX_FORCE_ENERGY;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* don't launch non-local copy-back if there was no non-local work to do */
- if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
+ if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
{
return;
}
{
adat_begin = 0;
adat_len = adat->natoms_local;
- adat_end = cu_nb->atdat->natoms_local;
+ adat_end = nb->atdat->natoms_local;
}
else
{
adat_begin = adat->natoms_local;
adat_len = adat->natoms - adat->natoms_local;
- adat_end = cu_nb->atdat->natoms;
+ adat_end = nb->atdat->natoms;
}
/* beginning of timed D2H section */
CU_RET_ERR(stat, "cudaEventRecord failed");
}
- if (!cu_nb->bUseStreamSync)
+ if (!nb->bUseStreamSync)
{
/* For safety reasons set a few (5%) forces to NaN. This way even if the
polling "hack" fails with some future NVIDIA driver we'll get a crash. */
/* With DD the local D2H transfer can only start after the non-local
kernel has finished. */
- if (iloc == eintLocal && cu_nb->bUseTwoStreams)
+ if (iloc == eintLocal && nb->bUseTwoStreams)
{
- stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
+ stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
}
back first. */
if (iloc == eintNonlocal)
{
- stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
+ stat = cudaEventRecord(nb->nonlocal_done, stream);
CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
}
/* DtoH fshift */
if (bCalcFshift)
{
- cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
- SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
+ cu_copy_D2H_async(nb->nbst.fshift, adat->fshift,
+ SHIFTS * sizeof(*nb->nbst.fshift), stream);
}
/* DtoH energies */
if (bCalcEner)
{
- cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
- sizeof(*cu_nb->nbst.e_lj), stream);
- cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
- sizeof(*cu_nb->nbst.e_el), stream);
+ cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj,
+ sizeof(*nb->nbst.e_lj), stream);
+ cu_copy_D2H_async(nb->nbst.e_el, adat->e_el,
+ sizeof(*nb->nbst.e_el), stream);
}
}
#endif
}
-void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags, int aloc,
- real *e_lj, real *e_el, rvec *fshift)
+void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags, int aloc,
+ real *e_lj, real *e_el, rvec *fshift)
{
/* NOTE: only implemented for single-precision at this time */
cudaError_t stat;
gmx_incons(stmp);
}
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *timers = cu_nb->timers;
- wallclock_gpu_t *timings = cu_nb->timings;
- nb_staging nbst = cu_nb->nbst;
+ cu_plist_t *plist = nb->plist[iloc];
+ cu_timers_t *timers = nb->timers;
+ struct gmx_wallclock_gpu_t *timings = nb->timings;
+ nb_staging nbst = nb->nbst;
- bool bCalcEner = flags & GMX_FORCE_ENERGY;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_ENERGY;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
NOTE: if timing with multiple GPUs (streams) becomes possible, the
counters could end up being inconsistent due to not being incremented
on some of the nodes! */
- if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
+ if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
{
return;
}
/* calculate the atom data index range based on locality */
if (LOCAL_A(aloc))
{
- adat_end = cu_nb->atdat->natoms_local;
+ adat_end = nb->atdat->natoms_local;
}
else
{
- adat_end = cu_nb->atdat->natoms;
+ adat_end = nb->atdat->natoms;
}
- if (cu_nb->bUseStreamSync)
+ if (nb->bUseStreamSync)
{
- stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
+ stat = cudaStreamSynchronize(nb->stream[iloc]);
CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
}
else
}
/* timing data accumulation */
- if (cu_nb->bDoTime)
+ if (nb->bDoTime)
{
/* only increase counter once (at local F wait) */
if (LOCAL_I(iloc))
/*! Set up the cache configuration for the non-bonded kernels.
 */
-void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
+void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo)
{
cudaError_t stat;
if (devinfo->prop.major >= 3)
{
/* Default kernel on sm 3.x 48/16 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
- stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
}
else
{
/* On Fermi, preferring L1 gives 2% higher performance */
/* Default kernel on sm_2.x 16/48 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
- stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
}
CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");