* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+/*! \file
+ * \brief Define CUDA implementation of nbnxn_gpu.h
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "config.h"
-#include <stdlib.h>
#include <assert.h>
+#include <stdlib.h>
+
+#include "gromacs/mdlib/nbnxn_gpu.h"
#if defined(_MSVC)
#include <limits>
#endif

#include <cuda.h>
-#include "types/simple.h"
-#include "types/nbnxn_pairlist.h"
-#include "types/nb_verlet.h"
-#include "types/ishift.h"
-#include "types/force_flags.h"
-#include "../nbnxn_consts.h"
-
#ifdef TMPI_ATOMICS
#include "thread_mpi/atomic.h"
#endif
+#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/cstringutil.h"
+
#include "nbnxn_cuda_types.h"
-#include "../../gmxlib/cuda_tools/cudautils.cuh"
-#include "nbnxn_cuda.h"
-#include "nbnxn_cuda_data_mgmt.h"
-#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
+#if defined HAVE_CUDA_TEXOBJ_SUPPORT && __CUDA_ARCH__ >= 300
#define USE_TEXOBJ
#endif
#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
+/* NTHREAD_Z controls the number of j-clusters processed concurrently per
+ * block: each block runs NTHREAD_Z warp-pairs, and each warp-pair works on
+ * its own j-cluster.
+ *
+ * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 threads/block
+ * with 16 blocks/multiprocessor, is the fastest even though this setup gives low
+ * occupancy. NTHREAD_Z > 1 results in excessive register spilling unless the
+ * minimum number of blocks per multiprocessor is reduced proportionally to keep
+ * the original maximum number of threads in flight (at slightly lower
+ * performance).
+ * - On CC 3.7 there are enough registers to double the number of threads; using
+ * NTHREAD_Z == 2 is fastest with 16 blocks (TODO: test with RF and other kernels
+ * with low-register use).
+ *
+ * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
+ * shuffle-based reduction, hence CC >= 3.0.
+ */
+
+/* Kernel launch bounds as a function of NTHREAD_Z.
+ * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
+ * - CC 3.7: NTHREAD_Z=2, (128, 16) bounds
+ */
+#if __CUDA_ARCH__ == 370
+#define NTHREAD_Z (2)
+#define MIN_BLOCKS_PER_MP (16)
+#else
+#define NTHREAD_Z (1)
+#define MIN_BLOCKS_PER_MP (16)
+#endif
+#define THREADS_PER_BLOCK (CL_SIZE*CL_SIZE*NTHREAD_Z)
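+
+/* Illustrative sketch only (hypothetical kernel name, not the actual contents
+ * of the kernel headers): the two macros above are consumed roughly as
+ *
+ *   __global__ void
+ *   __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
+ *   nbnxn_kernel_example(const cu_atomdata_t atdat,
+ *                        const cu_nbparam_t  nbparam,
+ *                        const cu_plist_t    plist,
+ *                        bool                bCalcFshift)
+ *   { ... }
+ *
+ * which caps per-thread register use so that MIN_BLOCKS_PER_MP blocks of
+ * THREADS_PER_BLOCK threads can stay resident on a multiprocessor.
+ */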
+
+
/***** The kernels come here *****/
-#include "nbnxn_cuda_kernel_utils.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
/* Top-level kernel generation: will generate through multiple inclusion the
 * following flavors for all kernels:
 * - force-only output;
 * - force and energy output;
 * - force-only output with pair list pruning;
 * - force and energy output with pair list pruning.
 */
/** Force only **/
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
/*** Pair-list pruning kernels ***/
/** Force only **/
#define PRUNE_NBL
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
#undef PRUNE_NBL
+
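+/* Illustrative sketch of the multiple-inclusion trick above (hypothetical
+ * macro and kernel names, not the actual contents of nbnxn_cuda_kernels.cuh):
+ * on each pass the header token-pastes a flavor-specific kernel name, along
+ * the lines of
+ *
+ *   #define CONCAT_(a, b) a ## b
+ *   #define CONCAT(a, b)  CONCAT_(a, b)
+ *   #ifdef CALC_ENERGIES
+ *   #define FLAVOR ener
+ *   #else
+ *   #define FLAVOR noener
+ *   #endif
+ *   __global__ void CONCAT(nbnxn_kernel_example_, FLAVOR)(...)
+ *   { ... }
+ *   #undef FLAVOR
+ *
+ * so that four distinct kernels are emitted from a single body as
+ * CALC_ENERGIES and PRUNE_NBL are toggled.
+ */
+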
/*! Nonbonded kernel function pointer type */
typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
const cu_nbparam_t,
static unsigned int poll_wait_pattern = (0x7FU << 23);
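+/* Illustrative sketch (hypothetical helper, not the verbatim selection code):
+ * given the kernel pointer tables used further below
+ * (nb_kfunc_*_{prune,noprune}_ptr[i][j]), picking a flavor amounts to
+ *
+ *   static inline nbnxn_cu_kfunc_ptr_t
+ *   select_kernel_example(int i, int j, bool bEner, bool bPrune)
+ *   {
+ *       return bEner ? (bPrune ? nb_kfunc_ener_prune_ptr[i][j]
+ *                              : nb_kfunc_ener_noprune_ptr[i][j])
+ *                    : (bPrune ? nb_kfunc_noener_prune_ptr[i][j]
+ *                              : nb_kfunc_noener_noprune_ptr[i][j]);
+ *   }
+ */
+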
/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
-static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
+static inline int calc_nb_kernel_nblock(int nwork_units, gmx_device_info_t *dinfo)
{
int max_grid_x_size;
}
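+/* A minimal sketch of the kind of check the elided body above performs (an
+ * assumption, not the verbatim code): the work-unit count maps 1:1 to the
+ * block count and must fit the device's 1D grid limit,
+ *
+ *   nblock          = nwork_units;
+ *   max_grid_x_size = dinfo->prop.maxGridSize[0];
+ *   if (nblock > max_grid_x_size)
+ *   {
+ *       // too many work units for a single 1D grid: report a fatal error
+ *   }
+ *   return nblock;
+ */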
/*! Calculates the amount of shared memory required by the CUDA kernel in use. */
-static inline int calc_shmem_required()
+static inline int calc_shmem_required(const int num_threads_z)
{
int shmem;
/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
- /* cj in shared memory, for both warps separately */
- shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
+ /* cj in shared memory, for each warp separately */
+ shmem += num_threads_z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
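+    /* Worked example (assuming the usual 8x8 cluster setup: NCL_PER_SUPERCL
+     * == 8, CL_SIZE == 8, NBNXN_GPU_JGROUP_SIZE == 4):
+     *   xq: 8 * 8 * sizeof(float4)              = 1024 bytes
+     *   cj: num_threads_z * 2 * 4 * sizeof(int) = num_threads_z * 32 bytes
+     */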
#ifdef IATYPE_SHMEM
/* i-atom types in shared memory */
shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
misc_ops_done event to record the point in time when the above operations
are finished and synchronize with this event in the non-local stream.
*/
-void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags,
- int iloc)
+void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags,
+ int iloc)
{
cudaError_t stat;
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
dim3 dim_block, dim_grid;
nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_nbparam_t *nbp = cu_nb->nbparam;
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *t = cu_nb->timers;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = nb->atdat;
+ cu_nbparam_t *nbp = nb->nbparam;
+ cu_plist_t *plist = nb->plist[iloc];
+ cu_timers_t *t = nb->timers;
+ cudaStream_t stream = nb->stream[iloc];
bool bCalcEner = flags & GMX_FORCE_ENERGY;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
- bool bDoTime = cu_nb->bDoTime;
+ bool bDoTime = nb->bDoTime;
    /* force the energy calculation always on or always off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
    /* When we get here all misc operations issued in the local stream are done,
so we record that in the local stream and wait for it in the nonlocal one. */
- if (cu_nb->bUseTwoStreams)
+ if (nb->bUseTwoStreams)
{
if (iloc == eintLocal)
{
- stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
+ stat = cudaEventRecord(nb->misc_ops_done, stream);
CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
}
else
{
- stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
+ stat = cudaStreamWaitEvent(stream, nb->misc_ops_done, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
}
}
bCalcEner,
plist->bDoPrune || always_prune);
- /* kernel launch config */
- nblock = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
- dim_block = dim3(CL_SIZE, CL_SIZE, 1);
+    /* Kernel launch config:
+     * - The thread block dimensions are the i-cluster size in x, the j-cluster
+     *   size in y, and the j-cluster concurrency (num_threads_z) in z.
+     * - The 1D block-grid contains as many blocks as super-clusters.
+     */
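+    /* CC 3.7 runs the kernels with NTHREAD_Z == 2 (see the launch-bounds
+     * macros above); this host-side value must stay in sync with that
+     * device-side choice. */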
+ int num_threads_z = 1;
+ if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7)
+ {
+ num_threads_z = 2;
+ }
+ nblock = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
+ dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
dim_grid = dim3(nblock, 1, 1);
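+    /* Example (assuming CL_SIZE == 8): dim_block is 8x8x1 = 64 threads/block
+     * on most devices and 8x8x2 = 128 threads/block on CC 3.7. */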
- shmem = calc_shmem_required();
+ shmem = calc_shmem_required(num_threads_z);
if (debug)
{
fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
- "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+            "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+ "\tShMem: %d\n",
dim_block.x, dim_block.y, dim_block.z,
dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
- NCL_PER_SUPERCL, plist->na_c);
+ NCL_PER_SUPERCL, plist->na_c,
+ shmem);
}
nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
}
}
-void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags,
- int aloc)
+void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags,
+ int aloc)
{
cudaError_t stat;
int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
gmx_incons(stmp);
}
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_timers_t *t = cu_nb->timers;
- bool bDoTime = cu_nb->bDoTime;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = nb->atdat;
+ cu_timers_t *t = nb->timers;
+ bool bDoTime = nb->bDoTime;
+ cudaStream_t stream = nb->stream[iloc];
bool bCalcEner = flags & GMX_FORCE_ENERGY;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* don't launch non-local copy-back if there was no non-local work to do */
- if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
+ if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
{
return;
}
{
adat_begin = 0;
adat_len = adat->natoms_local;
- adat_end = cu_nb->atdat->natoms_local;
+ adat_end = nb->atdat->natoms_local;
}
else
{
adat_begin = adat->natoms_local;
adat_len = adat->natoms - adat->natoms_local;
- adat_end = cu_nb->atdat->natoms;
+ adat_end = nb->atdat->natoms;
}
/* beginning of timed D2H section */
CU_RET_ERR(stat, "cudaEventRecord failed");
}
- if (!cu_nb->bUseStreamSync)
+ if (!nb->bUseStreamSync)
{
/* For safety reasons set a few (5%) forces to NaN. This way even if the
polling "hack" fails with some future NVIDIA driver we'll get a crash. */
/* With DD the local D2H transfer can only start after the non-local
kernel has finished. */
- if (iloc == eintLocal && cu_nb->bUseTwoStreams)
+ if (iloc == eintLocal && nb->bUseTwoStreams)
{
- stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
+ stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
}
back first. */
if (iloc == eintNonlocal)
{
- stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
+ stat = cudaEventRecord(nb->nonlocal_done, stream);
CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
}
/* DtoH fshift */
if (bCalcFshift)
{
- cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
- SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
+ cu_copy_D2H_async(nb->nbst.fshift, adat->fshift,
+ SHIFTS * sizeof(*nb->nbst.fshift), stream);
}
/* DtoH energies */
if (bCalcEner)
{
- cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
- sizeof(*cu_nb->nbst.e_lj), stream);
- cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
- sizeof(*cu_nb->nbst.e_el), stream);
+ cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj,
+ sizeof(*nb->nbst.e_lj), stream);
+ cu_copy_D2H_async(nb->nbst.e_el, adat->e_el,
+ sizeof(*nb->nbst.e_el), stream);
}
}
#endif
}
-void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags, int aloc,
- real *e_lj, real *e_el, rvec *fshift)
+void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags, int aloc,
+ real *e_lj, real *e_el, rvec *fshift)
{
/* NOTE: only implemented for single-precision at this time */
cudaError_t stat;
gmx_incons(stmp);
}
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *timers = cu_nb->timers;
- wallclock_gpu_t *timings = cu_nb->timings;
- nb_staging nbst = cu_nb->nbst;
+ cu_plist_t *plist = nb->plist[iloc];
+ cu_timers_t *timers = nb->timers;
+ struct gmx_wallclock_gpu_t *timings = nb->timings;
+ nb_staging nbst = nb->nbst;
- bool bCalcEner = flags & GMX_FORCE_ENERGY;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_ENERGY;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
    /* force the energy calculation always on or always off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
NOTE: if timing with multiple GPUs (streams) becomes possible, the
counters could end up being inconsistent due to not being incremented
on some of the nodes! */
- if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
+ if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
{
return;
}
/* calculate the atom data index range based on locality */
if (LOCAL_A(aloc))
{
- adat_end = cu_nb->atdat->natoms_local;
+ adat_end = nb->atdat->natoms_local;
}
else
{
- adat_end = cu_nb->atdat->natoms;
+ adat_end = nb->atdat->natoms;
}
- if (cu_nb->bUseStreamSync)
+ if (nb->bUseStreamSync)
{
- stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
+ stat = cudaStreamSynchronize(nb->stream[iloc]);
CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
}
else
}
/* timing data accumulation */
- if (cu_nb->bDoTime)
+ if (nb->bDoTime)
{
/* only increase counter once (at local F wait) */
if (LOCAL_I(iloc))
/*! Set up the cache configuration for the non-bonded kernels.
*/
-void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
+void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo)
{
cudaError_t stat;
if (devinfo->prop.major >= 3)
{
/* Default kernel on sm 3.x 48/16 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
- stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
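+            /* Only the status of the last flavor is stored and checked via
+             * CU_RET_ERR below; presumably this avoids set-but-unused
+             * warnings for the first three calls. */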
stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
}
else
{
            /* On Fermi, preferring L1 gives ~2% higher performance */
/* Default kernel on sm_2.x 16/48 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
- stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
}
CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");