/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/*! \endcond */
/* With GPU kernels the i and j cluster size is 8 atoms */
-static const int c_nbnxnGpuClusterSize = 8;
+static constexpr int c_nbnxnGpuClusterSize = 8;
/* The number of clusters in a super-cluster, used for GPU */
-static const int c_nbnxnGpuNumClusterPerSupercluster = 8;
+static constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
/* With GPU kernels we store cluster pairs in groups of 4
 * to make optimal use of 32-bit integers.
 */
-static const int c_nbnxnGpuJgroupSize = 32/c_nbnxnGpuNumClusterPerSupercluster;
+static constexpr int c_nbnxnGpuJgroupSize = 32/c_nbnxnGpuNumClusterPerSupercluster;
/* In CUDA the number of threads in a warp is 32 and we have cluster pairs
* of 8*8=64 atoms, so it's convenient to store data for cluster pair halves.
*/
-static const int c_nbnxnGpuClusterpairSplit = 2;
+static constexpr int c_nbnxnGpuClusterpairSplit = 2;
/* The fixed size of the exclusion mask array for a half cluster pair */
-static const int c_nbnxnGpuExclSize = c_nbnxnGpuClusterSize*c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
+static constexpr int c_nbnxnGpuExclSize = c_nbnxnGpuClusterSize*c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
/* A buffer data structure of 64 bytes
* to be placed at the beginning and end of structs
#else /* !GMX_SIMD4_HAVE_REAL */
/* 4-wide SIMD version.
- * A cluster is hard-coded to 8 atoms.
* The coordinates x_i are stored as xxxxyyyy..., x_j is stored xyzxyz...
* Using 8-wide AVX(2) is not faster on Intel Sandy Bridge and Haswell.
*/
- assert(c_nbnxnGpuClusterSize == 8);
+ static_assert(c_nbnxnGpuClusterSize == 8 || c_nbnxnGpuClusterSize == 4,
+ "A cluster is hard-coded to 4/8 atoms.");
Simd4Real rc2_S = Simd4Real(rlist2);
Simd4Real ix_S0 = load4(x_i + si*dim_stride + 0*GMX_SIMD4_WIDTH);
Simd4Real iy_S0 = load4(x_i + si*dim_stride + 1*GMX_SIMD4_WIDTH);
Simd4Real iz_S0 = load4(x_i + si*dim_stride + 2*GMX_SIMD4_WIDTH);
- Simd4Real ix_S1 = load4(x_i + si*dim_stride + 3*GMX_SIMD4_WIDTH);
- Simd4Real iy_S1 = load4(x_i + si*dim_stride + 4*GMX_SIMD4_WIDTH);
- Simd4Real iz_S1 = load4(x_i + si*dim_stride + 5*GMX_SIMD4_WIDTH);
+ Simd4Real ix_S1, iy_S1, iz_S1;
+ if (c_nbnxnGpuClusterSize == 8)
+ {
+ ix_S1 = load4(x_i + si*dim_stride + 3*GMX_SIMD4_WIDTH);
+ iy_S1 = load4(x_i + si*dim_stride + 4*GMX_SIMD4_WIDTH);
+ iz_S1 = load4(x_i + si*dim_stride + 5*GMX_SIMD4_WIDTH);
+ }
/* We loop from the outer to the inner particles to maximize
* the chance that we find a pair in range quickly and return.
*/
dx_S0 = ix_S0 - jx0_S;
dy_S0 = iy_S0 - jy0_S;
dz_S0 = iz_S0 - jz0_S;
- dx_S1 = ix_S1 - jx0_S;
- dy_S1 = iy_S1 - jy0_S;
- dz_S1 = iz_S1 - jz0_S;
dx_S2 = ix_S0 - jx1_S;
dy_S2 = iy_S0 - jy1_S;
dz_S2 = iz_S0 - jz1_S;
- dx_S3 = ix_S1 - jx1_S;
- dy_S3 = iy_S1 - jy1_S;
- dz_S3 = iz_S1 - jz1_S;
+ if (c_nbnxnGpuClusterSize == 8)
+ {
+ dx_S1 = ix_S1 - jx0_S;
+ dy_S1 = iy_S1 - jy0_S;
+ dz_S1 = iz_S1 - jz0_S;
+ dx_S3 = ix_S1 - jx1_S;
+ dy_S3 = iy_S1 - jy1_S;
+ dz_S3 = iz_S1 - jz1_S;
+ }
/* rsq = dx*dx+dy*dy+dz*dz */
rsq_S0 = norm2(dx_S0, dy_S0, dz_S0);
- rsq_S1 = norm2(dx_S1, dy_S1, dz_S1);
rsq_S2 = norm2(dx_S2, dy_S2, dz_S2);
- rsq_S3 = norm2(dx_S3, dy_S3, dz_S3);
+ if (c_nbnxnGpuClusterSize == 8)
+ {
+ rsq_S1 = norm2(dx_S1, dy_S1, dz_S1);
+ rsq_S3 = norm2(dx_S3, dy_S3, dz_S3);
+ }
wco_S0 = (rsq_S0 < rc2_S);
- wco_S1 = (rsq_S1 < rc2_S);
wco_S2 = (rsq_S2 < rc2_S);
- wco_S3 = (rsq_S3 < rc2_S);
-
- wco_any_S01 = wco_S0 || wco_S1;
- wco_any_S23 = wco_S2 || wco_S3;
- wco_any_S = wco_any_S01 || wco_any_S23;
+ if (c_nbnxnGpuClusterSize == 8)
+ {
+ wco_S1 = (rsq_S1 < rc2_S);
+ wco_S3 = (rsq_S3 < rc2_S);
+ }
+ if (c_nbnxnGpuClusterSize == 8)
+ {
+ wco_any_S01 = wco_S0 || wco_S1;
+ wco_any_S23 = wco_S2 || wco_S3;
+ wco_any_S = wco_any_S01 || wco_any_S23;
+ }
+ else
+ {
+ wco_any_S = wco_S0 || wco_S2;
+ }
if (anyTrue(wco_any_S))
{
/* Here we only set the self and double pair exclusions */
- assert(c_nbnxnGpuClusterpairSplit == 2);
+ static_assert(c_nbnxnGpuClusterpairSplit == 2, "");
get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
InRange = FALSE;
while (!InRange && jclusterFirst <= jclusterLast)
{
-#ifdef NBNXN_SEARCH_BB_SIMD4
+#if NBNXN_SEARCH_BB_SIMD4
d2 = subc_bb_dist2_simd4(0, bb_ci, jclusterFirst, gridj->bbj);
#else
d2 = subc_bb_dist2(0, bb_ci, jclusterFirst, gridj->bbj);
InRange = FALSE;
while (!InRange && jclusterLast > jclusterFirst)
{
-#ifdef NBNXN_SEARCH_BB_SIMD4
+#if NBNXN_SEARCH_BB_SIMD4
d2 = subc_bb_dist2_simd4(0, bb_ci, jclusterLast, gridj->bbj);
#else
d2 = subc_bb_dist2(0, bb_ci, jclusterLast, gridj->bbj);
InRange = FALSE;
while (!InRange && jclusterFirst <= jclusterLast)
{
-#ifdef NBNXN_SEARCH_BB_SIMD4
+#if NBNXN_SEARCH_BB_SIMD4
d2 = subc_bb_dist2_simd4(0, bb_ci, jclusterFirst, gridj->bbj);
#else
d2 = subc_bb_dist2(0, bb_ci, jclusterFirst, gridj->bbj);
InRange = FALSE;
while (!InRange && jclusterLast > jclusterFirst)
{
-#ifdef NBNXN_SEARCH_BB_SIMD4
+#if NBNXN_SEARCH_BB_SIMD4
d2 = subc_bb_dist2_simd4(0, bb_ci, jclusterLast, gridj->bbj);
#else
d2 = subc_bb_dist2(0, bb_ci, jclusterLast, gridj->bbj);