+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-/*! \internal \file
- *
- * \brief
- * Declares constants for the module
- *
- * \author Berk Hess <hess@kth.se>
- * \ingroup module_nbnxm
- */
-
-#ifndef GMX_NBNXN_CONSTANTS_H
-#define GMX_NBNXN_CONSTANTS_H
-
-/*! \brief Lower limit for square interaction distances in nonbonded kernels.
- *
- * For smaller values we will overflow when calculating r^-1 or r^-12, but
- * to keep it simple we always apply the limit from the tougher r^-12 condition.
- */
-#if GMX_DOUBLE
-// Some double precision SIMD architectures use single precision in the first
-// step, so although the double precision criterion would allow smaller rsq,
-// we need to stay in single precision with some margin for the N-R iterations.
-# define NBNXN_MIN_RSQ 1.0e-36
-#else
-// The worst intermediate value we might evaluate is r^-12, which
-// means we should ensure r^2 stays above pow(GMX_FLOAT_MAX,-1.0/6.0)*1.01 (some margin)
-# define NBNXN_MIN_RSQ 3.82e-07f // r > 6.2e-4
-#endif
-
-
-//! The number of clusters in a super-cluster, used for GPU
-#define c_nbnxnGpuNumClusterPerSupercluster 8
-
-/*! \brief With GPU kernels we group cluster pairs in 4 to optimize memory usage
- * of integers containing 32 bits.
- */
-#define c_nbnxnGpuJgroupSize (32 / c_nbnxnGpuNumClusterPerSupercluster)
-
-#endif
/* size of shmem (force-buffers/xq/atom type preloading) */
/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
- shmem = c_numClPerSupercl * c_clSize * sizeof(float4);
+ shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float4);
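+    /* For example, with the defaults c_nbnxnGpuNumClusterPerSupercluster = 8
+     * and c_clSize = 8, this float4 buffer takes 8 * 8 * 16 = 1024 bytes. */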
/* cj in shared memory, for each warp separately */
shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
if (nbp->vdwtype == evdwCuCUTCOMBGEOM || nbp->vdwtype == evdwCuCUTCOMBLB)
{
/* i-atom LJ combination parameters in shared memory */
- shmem += c_numClPerSupercl * c_clSize * sizeof(float2);
+ shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float2);
}
else
{
/* i-atom types in shared memory */
- shmem += c_numClPerSupercl * c_clSize * sizeof(int);
+ shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int);
}
return shmem;
"\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
"\tShMem: %zu\n",
config.blockSize[0], config.blockSize[1], config.blockSize[2], config.gridSize[0],
- config.gridSize[1], plist->nsci * c_numClPerSupercl, c_numClPerSupercl, plist->na_c,
- config.sharedMemorySize);
+ config.gridSize[1], plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize);
}
auto* timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
int shmem;
/* i-atom x in shared memory */
- shmem = c_numClPerSupercl * c_clSize * sizeof(float4);
+ shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float4);
/* cj in shared memory, for each warp separately */
shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
"\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
"\tShMem: %zu\n",
config.blockSize[0], config.blockSize[1], config.blockSize[2], config.gridSize[0],
- config.gridSize[1], numSciInPart * c_numClPerSupercl, c_numClPerSupercl,
- plist->na_c, config.sharedMemorySize);
+ config.gridSize[1], numSciInPart * c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize);
}
auto* timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr;
unsigned int wexcl, imask, mask_ji;
float4 xqbuf;
float3 xi, xj, rv, f_ij, fcj_buf;
- float3 fci_buf[c_numClPerSupercl]; /* i force buffer */
+ float3 fci_buf[c_nbnxnGpuNumClusterPerSupercluster]; /* i force buffer */
nbnxn_sci_t nb_sci;
- /*! i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
- const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+ /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */
+ const unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
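+    /* With c_nbnxnGpuNumClusterPerSupercluster = 8 this mask is 0xff: one
+     * interaction bit for each i-cluster in the super-cluster. */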
/*********************************************************************
* Set up shared memory pointers.
/* shmem buffer for i x+q pre-loading */
float4* xqib = (float4*)sm_nextSlotPtr;
- sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*xqib));
+ sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xqib));
/* shmem buffer for cj, for each warp separately */
int* cjs = (int*)(sm_nextSlotPtr);
# ifndef LJ_COMB
/* shmem buffer for i atom-type pre-loading */
int* atib = (int*)sm_nextSlotPtr;
- sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*atib));
+ sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*atib));
# else
/* shmem buffer for i-atom LJ combination rule parameters */
float2* ljcpib = (float2*)sm_nextSlotPtr;
- sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*ljcpib));
+ sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*ljcpib));
# endif
/*********************************************************************/
if (tidxz == 0)
{
/* Pre-load i-atom x and q into shared memory */
- ci = sci * c_numClPerSupercl + tidxj;
+ ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj;
ai = ci * c_clSize + tidxi;
float* shiftptr = (float*)&shift_vec[nb_sci.shift];
}
__syncthreads();
- for (i = 0; i < c_numClPerSupercl; i++)
+ for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
fci_buf[i] = make_float3(0.0f);
}
E_el = 0.0f;
# ifdef EXCLUSION_FORCES /* Ewald or RF */
- if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_numClPerSupercl)
+ if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
{
/* we have the diagonal: add the charge and LJ self interaction energy term */
- for (i = 0; i < c_numClPerSupercl; i++)
+ for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
# if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
qi = xqib[i * c_clSize + tidxi].w;
# ifdef LJ_EWALD
# if DISABLE_CUDA_TEXTURES
- E_lj += LDG(
- &nbparam.nbfp[atom_types[(sci * c_numClPerSupercl + i) * c_clSize + tidxi] * (ntypes + 1) * 2]);
+ E_lj += LDG(&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+ * (ntypes + 1) * 2]);
# else
E_lj += tex1Dfetch<float>(
nbparam.nbfp_texobj,
- atom_types[(sci * c_numClPerSupercl + i) * c_clSize + tidxi] * (ntypes + 1) * 2);
+ atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+ * (ntypes + 1) * 2);
# endif
# endif
}
Tested with up to nvcc 7.5 */
for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
- if (imask & (superClInteractionMask << (jm * c_numClPerSupercl)))
+ if (imask & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
{
- mask_ji = (1U << (jm * c_numClPerSupercl));
+ mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
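+            /* Bit (jm * c_nbnxnGpuNumClusterPerSupercluster + i) of imask flags
+             * the pair of the i-th i-cluster with the jm-th j-cluster. */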
cj = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize / c_splitClSize];
aj = cj * c_clSize + tidxj;
# if !defined PRUNE_NBL
# pragma unroll 8
# endif
- for (i = 0; i < c_numClPerSupercl; i++)
+ for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
if (imask & mask_ji)
{
- ci = sci * c_numClPerSupercl + i; /* i cluster index */
+ ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i; /* i cluster index */
/* all threads load an atom from i cluster ci into shmem! */
xqbuf = xqib[i * c_clSize + tidxi];
# endif /* LJ_COMB */
// Ensure the distance does not become so small that r^-12 overflows
- r2 = max(r2, NBNXN_MIN_RSQ);
+ r2 = max(r2, c_nbnxnMinDistanceSquared);
inv_r = rsqrt(r2);
inv_r2 = inv_r * inv_r;
float fshift_buf = 0.0f;
/* reduce i forces */
- for (i = 0; i < c_numClPerSupercl; i++)
+ for (i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
- ai = (sci * c_numClPerSupercl + i) * c_clSize + tidxi;
+ ai = (sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi;
reduce_force_i_warp_shfl(fci_buf[i], f, &fshift_buf, bCalcFshift, tidxj, ai, c_fullWarpMask);
}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/* shmem buffer for i x+q pre-loading */
float4* xib = (float4*)sm_nextSlotPtr;
- sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*xib));
+ sm_nextSlotPtr += (c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(*xib));
/* shmem buffer for cj, for each warp separately */
int* cjs = (int*)(sm_nextSlotPtr);
if (tidxz == 0)
{
/* Pre-load i-atom x and q into shared memory */
- int ci = sci * c_numClPerSupercl + tidxj;
+ int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj;
int ai = ci * c_clSize + tidxi;
/* We don't need q, but using float4 in shmem avoids bank conflicts.
# pragma unroll 4
for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
- if (imaskCheck & (superClInteractionMask << (jm * c_numClPerSupercl)))
+ if (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
{
- unsigned int mask_ji = (1U << (jm * c_numClPerSupercl));
+ unsigned int mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
int cj = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize / c_splitClSize];
int aj = cj * c_clSize + tidxj;
float3 xj = make_float3(tmp.x, tmp.y, tmp.z);
# pragma unroll 8
- for (int i = 0; i < c_numClPerSupercl; i++)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
if (imaskCheck & mask_ji)
{
static const int __device__ c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
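+// For example, the default c_clSize = 8 with c_nbnxnGpuClusterpairSplit = 2
+// gives c_splitClSize = 4.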
/*! \brief Stride in the force accumulation buffer */
static const int __device__ c_fbufStride = c_clSizeSq;
-/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
-static const unsigned __device__ superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+/*! \brief i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */
+static const unsigned __device__ superClInteractionMask =
+ ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
static const float __device__ c_oneSixth = 0.16666667f;
static const float __device__ c_oneTwelveth = 0.08333333f;
/* TODO: consider moving this to kernel_utils */
/* Convenience defines */
-/*! \brief number of clusters per supercluster. */
-static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
/*! \brief cluster size = number of atoms per cluster. */
-static const int c_clSize = c_nbnxnGpuClusterSize;
+static constexpr int c_clSize = c_nbnxnGpuClusterSize;
/*! \brief Electrostatic CUDA kernel flavors.
*
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/fatalerror.h"
-static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
-static const int c_clSize = c_nbnxnGpuClusterSize;
+static constexpr int c_clSize = c_nbnxnGpuClusterSize;
void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu* nbl,
const nbnxn_atomdata_t* nbat,
vctot = 0;
Vvdwtot = 0;
- if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_numClPerSupercl)
+ if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
{
/* we have the diagonal:
* add the charge self interaction energy term
*/
- for (im = 0; im < c_numClPerSupercl; im++)
+ for (im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
{
- ci = sci * c_numClPerSupercl + im;
+ ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
for (ic = 0; ic < c_clSize; ic++)
{
ia = ci * c_clSize + ic;
{
cj = nbl->cj4[cj4_ind].cj[jm];
- for (im = 0; im < c_numClPerSupercl; im++)
+ for (im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
{
/* We're only using the first imask,
* but here imei[1].imask is identical.
*/
- if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_numClPerSupercl + im)) & 1)
+ if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
+ & 1)
{
gmx_bool within_rlist;
- ci = sci * c_numClPerSupercl + im;
+ ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
within_rlist = FALSE;
npair = 0;
c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit;
int_bit = static_cast<real>(
(excl[jc / clusterPerSplit]->pair[(jc & (clusterPerSplit - 1)) * c_clSize + ic]
- >> (jm * c_numClPerSupercl + im))
+ >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
& 1);
js = ja * nbat->xstride;
}
// Ensure the distance does not become so small that r^-12 overflows
- rsq = std::max(rsq, NBNXN_MIN_RSQ);
+ rsq = std::max(rsq, c_nbnxnMinDistanceSquared);
rinv = gmx::invsqrt(rsq);
rinvsq = rinv * rinv;
// Ensure the distances do not fall below the limit where r^-12 overflows.
// This should never happen for normal interactions.
- rsq = std::max(rsq, NBNXN_MIN_RSQ);
+ rsq = std::max(rsq, c_nbnxnMinDistanceSquared);
#ifdef COUNT_PAIRS
npair++;
rcvdw2_S = SimdReal(ic->rvdw * ic->rvdw);
#endif
- minRsq_S = SimdReal(NBNXN_MIN_RSQ);
+ minRsq_S = SimdReal(c_nbnxnMinDistanceSquared);
const real* gmx_restrict q = nbatParams.q.data();
const real facel = ic->epsfac;
rcvdw2_S = SimdReal(ic->rvdw * ic->rvdw);
#endif
- minRsq_S = SimdReal(NBNXN_MIN_RSQ);
+ minRsq_S = SimdReal(c_nbnxnMinDistanceSquared);
const real* gmx_restrict q = nbatParams.q.data();
const real facel = ic->epsfac;
string(REGEX REPLACE ".*=" "" ELEC_NAME "${ELEC_DEF}")
string(REGEX REPLACE ".*=" "" VDW_NAME "${VDW_DEF}")
set(OBJ_FILE nbnxm_ocl_kernel${ELEC_NAME}${VDW_NAME}_${VENDOR}.o)
+    # The constants below duplicate various others (e.g. from pairlist.h),
+    # but since the kernels compiled here are not used for production,
+    # it is OK if the values fall out of sync.
add_custom_command(OUTPUT ${OBJ_FILE} COMMAND ${OCL_COMPILER}
${CMAKE_CURRENT_SOURCE_DIR}/nbnxm_ocl_kernels.cl ${CLANG_TIDY_ARGS}
-Xclang -finclude-default-header -D_${VENDOR}_SOURCE_
-DGMX_OCL_FASTGEN ${ELEC_DEF} ${VDW_DEF}
- -DNBNXN_GPU_CLUSTER_SIZE=${CLUSTER_SIZE} -DIATYPE_SHMEM
+ -Dc_nbnxnGpuClusterSize=${CLUSTER_SIZE}
+ -Dc_nbnxnMinDistanceSquared=3.82e-07F
+ -Dc_nbnxnGpuNumClusterPerSupercluster=8
+ -Dc_nbnxnGpuJgroupSize=4
+ -DIATYPE_SHMEM
-c -I ${CMAKE_SOURCE_DIR}/src -std=cl1.2
-Weverything -Wno-conversion -Wno-missing-variable-declarations -Wno-used-but-marked-unused
-Wno-cast-align -Wno-incompatible-pointer-types
/*! \brief Convenience constants */
//@{
-static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
-static const int c_clSize = c_nbnxnGpuClusterSize;
+static constexpr int c_clSize = c_nbnxnGpuClusterSize;
//@}
/* size of shmem (force-buffers/xq/atom type preloading) */
/* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
/* i-atom x+q in shared memory */
- shmem = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
+ shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4; /* xqib */
/* cj in shared memory, for both warps separately
* TODO: in the "nowarp kernels we load cj only once so the factor 2 is not needed.
*/
if (useLjCombRule(vdwType))
{
/* i-atom LJ combination parameters in shared memory */
- shmem += c_numClPerSupercl * c_clSize * 2 * sizeof(float); /* atib abused for ljcp, float2 */
+ shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * 2
+ * sizeof(float); /* atib abused for ljcp, float2 */
}
else
{
/* i-atom types in shared memory */
- shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
+ shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int); /* atib */
}
}
/* force reduction buffers in shared memory */
"Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
config.blockSize[0], config.blockSize[1], config.blockSize[2],
config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
- plist->nsci * c_numClPerSupercl, c_numClPerSupercl, plist->na_c);
+ plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuNumClusterPerSupercluster, plist->na_c);
}
fillin_ocl_structures(nbp, &nbparams_params);
int shmem;
/* i-atom x in shared memory (for convenience we load all 4 components including q) */
- shmem = c_numClPerSupercl * c_clSize * sizeof(float) * 4;
+ shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4;
/* cj in shared memory, for each warp separately
* Note: only need to load once per wavefront, but to keep the code simple,
* for now we load twice on AMD.
"\tShMem: %zu\n",
config.blockSize[0], config.blockSize[1], config.blockSize[2],
config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
- plist->nsci * c_numClPerSupercl, c_numClPerSupercl, plist->na_c, config.sharedMemorySize);
+ plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize);
}
cl_nbparam_params_t nbparams_params;
std::string extraDefines =
makeDefinesForKernelTypes(bFastGen, nb->nbparam->eeltype, nb->nbparam->vdwtype);
- /* Here we pass macros and static const int variables defined
+ /* Here we pass macros and static const/constexpr int variables defined
* in include files outside the OpenCL code as macros, to avoid
- * including those files in the JIT compilation that happens
- * at runtime. This is particularly a problem for headers that
- * depend on config.h, such as pairlist.h. */
+ * including those files in the plain-C JIT compilation that happens
+ * at runtime. */
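+    /* As a concrete example (assuming the default values of these constants),
+     * the appended defines are:
+     *   -Dc_nbnxnGpuClusterSize=8 -Dc_nbnxnMinDistanceSquared=3.82e-07
+     *   -Dc_nbnxnGpuNumClusterPerSupercluster=8 -Dc_nbnxnGpuJgroupSize=4 */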
extraDefines += gmx::formatString(
- " -DNBNXN_GPU_CLUSTER_SIZE=%d "
+ " -Dc_nbnxnGpuClusterSize=%d"
+ " -Dc_nbnxnMinDistanceSquared=%g"
+ " -Dc_nbnxnGpuNumClusterPerSupercluster=%d"
+ " -Dc_nbnxnGpuJgroupSize=%d"
"%s",
- c_nbnxnGpuClusterSize, /* Defined in nbnxn_pairlist.h */
- (nb->bPrefetchLjParam) ? "-DIATYPE_SHMEM" : "");
+ c_nbnxnGpuClusterSize, c_nbnxnMinDistanceSquared, c_nbnxnGpuNumClusterPerSupercluster,
+ c_nbnxnGpuJgroupSize, (nb->bPrefetchLjParam) ? " -DIATYPE_SHMEM" : "");
try
{
/* TODO when we have a proper MPI-aware logging module,
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012-2018, The GROMACS development team.
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
const int bidx = get_group_id(0);
const int widx = tidx / WARP_SIZE; /* warp index */
- /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */
- const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
+ /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */
+ const unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
-#define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE)
+#define LOCAL_OFFSET (xqib + c_nbnxnGpuNumClusterPerSupercluster * CL_SIZE)
CjType cjs = 0;
#if USE_CJ_PREFETCH
/* shmem buffer for cj, for both warps separately */
/* shmem buffer for i atom-type pre-loading */
__local int* atib = (__local int*)(LOCAL_OFFSET); //NOLINT(google-readability-casting)
# undef LOCAL_OFFSET
-# define LOCAL_OFFSET (atib + NCL_PER_SUPERCL * CL_SIZE)
+# define LOCAL_OFFSET (atib + c_nbnxnGpuNumClusterPerSupercluster * CL_SIZE)
# else
__local float2* ljcpib = (__local float2*)(LOCAL_OFFSET);
# undef LOCAL_OFFSET
-# define LOCAL_OFFSET (ljcpib + NCL_PER_SUPERCL * CL_SIZE)
+# define LOCAL_OFFSET (ljcpib + c_nbnxnGpuNumClusterPerSupercluster * CL_SIZE)
# endif
#endif
const int cij4_start = nb_sci.cj4_ind_start; /* first ...*/
const int cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */
- for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += CL_SIZE)
{
/* Pre-load i-atom x and q into shared memory */
- const int ci = sci * NCL_PER_SUPERCL + tidxj + i;
+ const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i;
const int ai = ci * CL_SIZE + tidxi;
float4 xqbuf = xq[ai]
#endif
barrier(CLK_LOCAL_MEM_FENCE);
- float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
- for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ float3 fci_buf[c_nbnxnGpuNumClusterPerSupercluster]; /* i force buffer */
+ for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
{
fci_buf[ci_offset] = (float3)(0.0F);
}
float E_el = 0.0F;
# if defined EXCLUSION_FORCES /* Ewald or RF */
- if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * NCL_PER_SUPERCL)
+ if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
{
/* we have the diagonal: add the charge and LJ self interaction energy term */
- for (int i = 0; i < NCL_PER_SUPERCL; i++)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
# if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
const float qi = xqib[i * CL_SIZE + tidxi].w;
E_el += qi * qi;
# endif
# if defined LJ_EWALD
- E_lj += nbfp_climg2d[atom_types[(sci * NCL_PER_SUPERCL + i) * CL_SIZE + tidxi] * (ntypes + 1) * 2];
+ E_lj += nbfp_climg2d[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * CL_SIZE + tidxi]
+ * (ntypes + 1) * 2];
# endif /* LJ_EWALD */
}
#endif
for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
- if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL)))
+ if (imask & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
{
- unsigned int mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+ unsigned int mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
const int cj = loadCj(cjs, pl_cj4[j4].cj, jm, tidxi, tidxj);
const int aj = cj * CL_SIZE + tidxj;
#if !defined PRUNE_NBL
# pragma unroll 8
#endif
- for (int i = 0; i < NCL_PER_SUPERCL; i++)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
if (imask & mask_ji)
{
- const int gmx_unused ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */
+ const int gmx_unused ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i; /* i cluster index */
/* all threads load an atom from i cluster ci into shmem! */
const float4 xiqbuf = xqib[i * CL_SIZE + tidxi];
# endif /* LJ_COMB_GEOM */
#endif /* LJ_COMB */
- // Ensure distance do not become so small that r^-12 overflows
- r2 = max(r2, NBNXN_MIN_RSQ);
+        // Ensure the distance does not become so small that r^-12 overflows.
+ // Cast to float to ensure the correct built-in max() function
+ // is called.
+ r2 = max(r2, (float)c_nbnxnMinDistanceSquared);
const float inv_r = rsqrt(r2);
const float inv_r2 = inv_r * inv_r;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#endif
// TODO move these consts to utils and unify their use with the nonbonded kernels
- const int c_numClPerSupercl = NCL_PER_SUPERCL;
- const int c_clSize = CL_SIZE;
+ const int c_clSize = CL_SIZE;
// TODO pass this value at compile-time as a macro
const int c_nbnxnGpuClusterpairSplit = 2;
- /*! i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
- const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+ /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set */
+ const unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
-#define LOCAL_OFFSET (xib + c_numClPerSupercl * c_clSize)
+#define LOCAL_OFFSET (xib + c_nbnxnGpuNumClusterPerSupercluster * c_clSize)
/* shmem buffer for i cj pre-loading */
CjType cjs = 0;
#if USE_CJ_PREFETCH
cjs = (((__local int*)(LOCAL_OFFSET)) + tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize);
# undef LOCAL_OFFSET
/* Offset calculated using xib because cjs depends on tidxz! */
-# define LOCAL_OFFSET \
- (((__local int*)(xib + c_numClPerSupercl * c_clSize)) \
+# define LOCAL_OFFSET \
+ (((__local int*)(xib + c_nbnxnGpuNumClusterPerSupercluster * c_clSize)) \
+ (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize))
#endif
#if !USE_SUBGROUP_ANY
if (tidxz == 0)
{
- for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += CL_SIZE)
{
/* Pre-load i-atom x and q into shared memory */
- const int ci = sci * c_numClPerSupercl + tidxj + i;
+ const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i;
const int ai = ci * c_clSize + tidxi;
/* We don't need q, but using float4 in shmem avoids bank conflicts */
#pragma unroll 4
for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
- if (imaskCheck & (superClInteractionMask << (jm * c_numClPerSupercl)))
+ if (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
{
- unsigned int mask_ji = (1U << (jm * c_numClPerSupercl));
+ unsigned int mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
const int cj = loadCj(cjs, pl_cj4[j4].cj, jm, tidxi, tidxj);
const int aj = cj * c_clSize + tidxj;
const float3 xj = (float3)(tmp.xyz);
#pragma unroll 8
- for (int i = 0; i < c_numClPerSupercl; i++)
+ for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
if (imaskCheck & mask_ji)
{
#include "gromacs/gpu_utils/device_utils.clh"
#include "gromacs/gpu_utils/vectype_ops.clh"
-#include "gromacs/nbnxm/constants.h"
#include "gromacs/pbcutil/ishift.h"
#include "nbnxm_ocl_consts.h"
-#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
-#define NCL_PER_SUPERCL c_nbnxnGpuNumClusterPerSupercluster
+#define CL_SIZE (c_nbnxnGpuClusterSize)
#define WARP_SIZE (CL_SIZE * CL_SIZE / 2) // Currently only c_nbnxnGpuClusterpairSplit=2 supported
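+// For example, CL_SIZE = 8 gives WARP_SIZE = 8 * 8 / 2 = 32.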
*/
} nbnxn_excl_t;
-/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
-__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+/*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster bits set */
+__constant unsigned supercl_interaction_mask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
gmx_opencl_inline void preloadCj4Generic(__local int* sm_cjPreload,
const __global int* gm_cj,
/* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
* for CL_SIZE>4.*/
float2 fshift_buf = 0;
- for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
{
- int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
float3 fin = fci_buf[ci_offset];
fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, CL_SIZE);
__global float* fshift)
{
float fshift_buf = 0;
- for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
{
- int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
int tidx = tidxi + tidxj * CL_SIZE;
/* store i forces in shmem */
f_buf[tidx] = fci_buf[ci_offset].x;
#include "gromacs/utility/enumerationhelpers.h"
#include "gromacs/utility/real.h"
-// This file with constants is separate from this file to be able
-// to include it during OpenCL jitting without including config.h
-#include "constants.h"
#include "pairlistparams.h"
struct NbnxnPairlistCpuWork;
//! \}
//! \}
+/*! \brief Lower limit for square interaction distances in nonbonded kernels.
+ *
+ * For smaller values we will overflow when calculating r^-1 or r^-12, but
+ * to keep it simple we always apply the limit from the tougher r^-12 condition.
+ */
+#if GMX_DOUBLE
+// Some double precision SIMD architectures use single precision in the first
+// step, so although the double precision criterion would allow smaller rsq,
+// we need to stay in single precision with some margin for the N-R iterations.
+constexpr double c_nbnxnMinDistanceSquared = 1.0e-36;
+#else
+// The worst intermediate value we might evaluate is r^-12, which
+// means we should ensure r^2 stays above pow(GMX_FLOAT_MAX,-1.0/6.0)*1.01 (some margin)
+constexpr float c_nbnxnMinDistanceSquared = 3.82e-07F; // r > 6.2e-4
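+// As a sanity check of this value (assuming GMX_FLOAT_MAX ~= 3.4e38):
+// pow(3.4e38, -1.0/6.0) ~= 3.78e-7, and the 1.01 margin gives ~3.82e-7,
+// corresponding to r > sqrt(3.82e-7) ~= 6.2e-4.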
+#endif
+
+
+//! The number of clusters in a super-cluster, used for GPU kernels
+constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
+
+/*! \brief With GPU kernels we group cluster pairs in groups of 4 to make
+ * full use of the 32 bits in the integer interaction masks.
+ */
+constexpr int c_nbnxnGpuJgroupSize = (32 / c_nbnxnGpuNumClusterPerSupercluster);
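+// With c_nbnxnGpuNumClusterPerSupercluster = 8 this gives a j-group size of 4:
+// bit (jm * 8 + i) of a 32-bit interaction mask flags i-cluster i against
+// j-cluster jm.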
+
/*! \internal
* \brief Simple pair-list i-unit
*/