*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* /Erik 001109
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <assert.h>
-#include "typedefs.h"
-#include "txtdump.h"
-#include "vec.h"
-#include "gromacs/utility/smalloc.h"
-#include "coulomb.h"
-#include "gmx_fatal.h"
#include "pme.h"
-#include "network.h"
-#include "physics.h"
-#include "nrnb.h"
-#include "macros.h"
+#include "config.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "gromacs/ewald/pme-internal.h"
+#include "gromacs/fft/fft.h"
#include "gromacs/fft/parallel_3dfft.h"
-#include "gromacs/fileio/futil.h"
-#include "gromacs/fileio/pdbio.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/forcerec.h"
+#include "gromacs/legacyheaders/types/inputrec.h"
+#include "gromacs/legacyheaders/types/nrnb.h"
#include "gromacs/math/gmxcomplex.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/math/vectypes.h"
+/* Include the SIMD macro file and then check for support */
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/gmxomp.h"
+#include "gromacs/utility/real.h"
+#include "gromacs/utility/smalloc.h"
+
+#ifdef DEBUG_PME
+#include "gromacs/fileio/pdbio.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/futil.h"
+#endif
-/* Include the SIMD macro file and then check for support */
-#include "gromacs/simd/simd.h"
-#include "gromacs/simd/simd_math.h"
#ifdef GMX_SIMD_HAVE_REAL
/* Turn on arbitrary width SIMD intrinsics for PME solve */
# define PME_SIMD_SOLVE
int recv_size; /* Receive buffer width, used with OpenMP */
} pme_grid_comm_t;
+typedef real *splinevec[DIM];
+
typedef struct {
#ifdef GMX_MPI
MPI_Comm mpi_comm;
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
-#include "pme_simd4.h"
+#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
#else
DO_BSPLINE(4);
#endif
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_SPREAD_SIMD4_ALIGNED
#define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
#else
DO_BSPLINE(5);
#endif
static void realloc_work(pme_work_t *work, int nkx)
{
- int simd_width;
+ int simd_width, i;
if (nkx > work->nalloc)
{
snew_aligned(work->tmp2, work->nalloc+simd_width, simd_width*sizeof(real));
snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
srenew(work->m2inv, work->nalloc);
+#ifndef NDEBUG
+ for (i = 0; i < work->nalloc+simd_width; i++)
+ {
+ work->denom[i] = 1; /* init to 1 to avoid 1/0 exceptions of simd padded elements */
+ }
+#endif
}
}
#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 4
#endif
-#include "pme_simd4.h"
+#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
#else
DO_FSPLINE(4);
#endif
#ifdef PME_SIMD4_SPREAD_GATHER
#define PME_GATHER_F_SIMD4_ALIGNED
#define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
#else
DO_FSPLINE(5);
#endif
int fft_my, fft_mz;
int buf_my = -1;
int nsx, nsy, nsz;
- ivec localcopy_end;
+ ivec localcopy_end, commcopy_end;
int offx, offy, offz, x, y, z, i0, i0t;
int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
* not beyond the local FFT grid.
*/
localcopy_end[d] =
- min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+ min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
local_fft_ndata[d]);
+
+ /* Determine up to where our thread needs to copy from the
+ * thread-local charge spreading grid to the communication buffer.
+ * Note: only relevant with communication, ignored otherwise.
+ */
+ commcopy_end[d] = localcopy_end[d];
+ if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
+ {
+ /* The last thread should copy up to the last pme grid line.
+ * When the rank-local FFT grid is narrower than pme-order,
+ * we need the max below to ensure copying of all data.
+ */
+ commcopy_end[d] = max(commcopy_end[d], pme->pme_order);
+ }
}
offx = pmegrid->offset[XX];
}
pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
ox += pmegrid_g->offset[XX];
- /* Determine the end of our part of the source grid */
- if (!bCommX)
- {
- /* Use our thread local source grid and target grid part */
- tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
- }
- else
- {
- /* Use our thread local source grid and the spreading range */
- tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
- }
+ /* Determine the end of our part of the source grid.
+ * Use our thread local source grid and target grid part
+ */
+ tx1 = min(ox + pmegrid_g->n[XX],
+ !bCommX ? localcopy_end[XX] : commcopy_end[XX]);
for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
{
}
pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
oy += pmegrid_g->offset[YY];
- /* Determine the end of our part of the source grid */
- if (!bCommY)
- {
- /* Use our thread local source grid and target grid part */
- ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
- }
- else
- {
- /* Use our thread local source grid and the spreading range */
- ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
- }
+ /* Determine the end of our part of the source grid.
+ * Use our thread local source grid and target grid part
+ */
+ ty1 = min(oy + pmegrid_g->n[YY],
+ !bCommY ? localcopy_end[YY] : commcopy_end[YY]);
for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
{
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
+
+#include "nbnxn_cuda.h"
+
+#include "config.h"
-#include <stdlib.h>
#include <assert.h>
+#include <stdlib.h>
#if defined(_MSVC)
#include <limits>
#include <cuda.h>
-#include "types/simple.h"
-#include "types/nbnxn_pairlist.h"
-#include "types/nb_verlet.h"
-#include "types/ishift.h"
-#include "types/force_flags.h"
-#include "../nbnxn_consts.h"
-
#ifdef TMPI_ATOMICS
#include "thread_mpi/atomic.h"
#endif
+#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/cstringutil.h"
+
#include "nbnxn_cuda_types.h"
-#include "../../gmxlib/cuda_tools/cudautils.cuh"
-#include "nbnxn_cuda.h"
-#include "nbnxn_cuda_data_mgmt.h"
#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
#define USE_TEXOBJ
#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
+/* NTHREAD_Z controls the number of j-clusters processed concurrently on NTHREAD_Z
+ * warp-pairs per block.
+ *
+ * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 th/block with 16
+ * blocks/multiproc, is the fastest even though this setup gives low occupancy.
+ * NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks
+ * per multiprocessor is reduced proportionally to get the original number of max
+ * threads in flight (and slightly lower performance).
+ * - On CC 3.7 there are enough registers to double the number of threads; using
+ * NTHREADS_Z == 2 is fastest with 16 blocks (TODO: test with RF and other kernels
+ * with low-register use).
+ *
+ * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
+ * shuffle-based reduction, hence CC >= 3.0.
+ */
+
+/* Kernel launch bounds as function of NTHREAD_Z.
+ * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
+ * - CC 3.7: NTHREAD_Z=2, (128, 16) bounds
+ */
+#if __CUDA_ARCH__ == 370
+#define NTHREAD_Z (2)
+#define MIN_BLOCKS_PER_MP (16)
+#else
+#define NTHREAD_Z (1)
+#define MIN_BLOCKS_PER_MP (16)
+#endif
+#define THREADS_PER_BLOCK (CL_SIZE*CL_SIZE*NTHREAD_Z)
+
+
/***** The kernels come here *****/
-#include "nbnxn_cuda_kernel_utils.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
/* Top-level kernel generation: will generate through multiple inclusion the
* following flavors for all kernels:
* - force and energy output with pair list pruning.
*/
/** Force only **/
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
/*** Pair-list pruning kernels ***/
/** Force only **/
#define PRUNE_NBL
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
-#include "nbnxn_cuda_kernels.cuh"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
#undef PRUNE_NBL
}
/*! Calculates the amount of shared memory required by the CUDA kernel in use. */
-static inline int calc_shmem_required()
+static inline int calc_shmem_required(const int num_threads_z)
{
int shmem;
/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
- /* cj in shared memory, for both warps separately */
- shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
+ /* cj in shared memory, for each warp separately */
+ shmem += num_threads_z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
#ifdef IATYPE_SHMEM
/* i-atom types in shared memory */
shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
cu_timers_t *t = cu_nb->timers;
cudaStream_t stream = cu_nb->stream[iloc];
- bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_ENERGY;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
bool bDoTime = cu_nb->bDoTime;
bCalcEner,
plist->bDoPrune || always_prune);
- /* kernel launch config */
+ /* Kernel launch config:
+ * - The thread block dimensions match the size of i-clusters, j-clusters,
+ * and j-cluster concurrency, in x, y, and z, respectively.
+ * - The 1D block-grid contains as many blocks as super-clusters.
+ */
+ int num_threads_z = 1;
+ if (cu_nb->dev_info->prop.major == 3 && cu_nb->dev_info->prop.minor == 7)
+ {
+ num_threads_z = 2;
+ }
nblock = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
- dim_block = dim3(CL_SIZE, CL_SIZE, 1);
+ dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
dim_grid = dim3(nblock, 1, 1);
- shmem = calc_shmem_required();
+ shmem = calc_shmem_required(num_threads_z);
if (debug)
{
fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
- "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+ "\tGrid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+ "\tShMem: %d\n",
dim_block.x, dim_block.y, dim_block.z,
dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
- NCL_PER_SUPERCL, plist->na_c);
+ NCL_PER_SUPERCL, plist->na_c,
+ shmem);
}
nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
bool bDoTime = cu_nb->bDoTime;
cudaStream_t stream = cu_nb->stream[iloc];
- bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_ENERGY;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* don't launch copy-back if there was no work to do */
wallclock_gpu_t *timings = cu_nb->timings;
nb_staging nbst = cu_nb->nbst;
- bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_ENERGY;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
if (devinfo->prop.major >= 3)
{
/* Default kernel on sm 3.x 48/16 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
- stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+ cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
}
else
{
/* On Fermi prefer L1 gives 2% higher performance */
/* Default kernel on sm_2.x 16/48 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
- stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
- stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
+ cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
}
CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");