src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief
  38  *  CUDA non-bonded prune-only kernel.
  39  *
  40  *  Unlike the non-bonded interaction kernels, this is not preprocessor-generated,
  41  *  the two flavors achieved by templating.
  42  *
  43  *  \author Szilárd Páll <pall.szilard@gmail.com>
  44  *  \author Berk Hess <hess@kth.se>
  45  *  \ingroup module_nbnxm
  46  */
  47 #include "gmxpre.h"
  48
  49 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  50 #include "gromacs/math/utilities.h"
  51 #include "gromacs/pbcutil/ishift.h"
  52
  53 #include "nbnxm_cuda_kernel_utils.cuh"
  54 #include "nbnxm_cuda_types.h"
  55
  56 /* Note that floating-point constants in CUDA code should be suffixed
  57  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  58  * code that is in double precision.
  59  */
  60
  61 /**@{*/
  62 /*! \brief Compute capability dependent definition of kernel launch configuration parameters.
  63  *
  64  * Kernel launch bounds for different compute capabilities. The value of NTHREAD_Z
  65  * represents the j-concurrency, hence it determines the number of threads per block.
  66  * It is chosen such that 100% occupancy is maintained (on Maxwell and later for any NTHREAD_Z,
  67  * requires >=4 warp/block, NTHREAD_Z>=2 on Kepler).
  68  *
  69  * Hence, values NTHREAD_Z >= 2 trade inter- for intra-block parallelism
  70  * which has the advantage of lowering the overhead of starting up a block, filling shmem
  71  * and registers, etc. Ideally we'd want to expose as much intra-block work as possible
  72  * As we also split lists to cater for the block-parallelization needed by the register-
  73  * limited non-bonded kernels, for very short j-loops large NTHREAD_Z will cause slowdown
  74  * as it leads to intra-block warp imbalance. Ideally, we'd want to auto-tune the choice
  75  * of NTHREAD_Z, but for now we instead pick a reasonable tradeoff-value.
  76  *
  77  * Note that given the above input size tradeoffs and that performance depends on
  78  * additional factors including GPU arch, #SM's, we'll accept performance tradeoffs
  79  * of using a fixed NTHREAD_Z=4. The following outliers have been observed:
  80  *   - up to 25% faster (rolling) prune kernels with NTHREAD_Z=8 in the regime where lists
  81  *     are not split (much), but the rolling chunks are small;
  82  *   - with large inputs NTHREAD_Z=1 is 2-3% faster (on CC>=5.0)
  83  */
  84 #define NTHREAD_Z           (GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY)
  85 #define THREADS_PER_BLOCK   (c_clSize*c_clSize*NTHREAD_Z)
  86 // we want 100% occupancy, so max threads/block
  87 #define MIN_BLOCKS_PER_MP   (GMX_CUDA_MAX_THREADS_PER_MP/THREADS_PER_BLOCK)
  88 /**@}*/
  89
  90 /*! \brief Nonbonded list pruning kernel.
  91  *
  92  *  The \p haveFreshList template parameter defines the two flavors of the kernel; when
  93  *  true a new list from immediately after pair-list generation is pruned using rlistOuter,
  94  *  the pruned masks are stored in a separate buffer and the outer-list is pruned
  95  *  using the rlistInner distance; when false only the pruning with rlistInner is performed.
  96  *
  97  *  Kernel launch parameters:
  98  *   - #blocks   = #pair lists, blockId = pair list Id
  99  *   - #threads  = NTHREAD_Z * c_clSize^2
 100  *   - shmem     = see nbnxn_cuda.cu:calc_shmem_required_prune()
 101  *
 102  *   Each thread calculates an i-j atom distance..
 103  */
 104 template <bool haveFreshList>
 105 __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
 106 __global__ void nbnxn_kernel_prune_cuda(const cu_atomdata_t atdat,
 107                                         const cu_nbparam_t  nbparam,
 108                                         const cu_plist_t    plist,
 109                                         int                 numParts,
 110                                         int                 part)
 111 #ifdef FUNCTION_DECLARATION_ONLY
 112 ;     /* Only do function declaration, omit the function body. */
 113
 114 // Add extern declarations so each translation unit understands that
 115 // there will be a definition provided.
 116 extern template
 117 __global__ void
 118 nbnxn_kernel_prune_cuda<true>(const cu_atomdata_t, const cu_nbparam_t,
 119                               const cu_plist_t, int, int);
 120 extern template
 121 __global__ void
 122 nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const cu_nbparam_t,
 123                                const cu_plist_t, int, int);
 124 #else
 125 {
 126
 127     /* convenience variables */
 128     const nbnxn_sci_t  *pl_sci      = plist.sci;
 129     nbnxn_cj4_t        *pl_cj4      = plist.cj4;
 130     const float4       *xq          = atdat.xq;
 131     const float3       *shift_vec   = atdat.shift_vec;
 132
 133     float               rlistOuter_sq = nbparam.rlistOuter_sq;
 134     float               rlistInner_sq = nbparam.rlistInner_sq;
 135
 136     /* thread/block/warp id-s */
 137     unsigned int tidxi  = threadIdx.x;
 138     unsigned int tidxj  = threadIdx.y;
 139 #if NTHREAD_Z == 1
 140     unsigned int tidxz  = 0;
 141 #else
 142     unsigned int tidxz  = threadIdx.z;
 143 #endif
 144     unsigned int bidx   = blockIdx.x;
 145     unsigned int widx   = (threadIdx.y * c_clSize) / warp_size; /* warp index */
 146
 147     /*********************************************************************
 148      * Set up shared memory pointers.
 149      * sm_nextSlotPtr should always be updated to point to the "next slot",
 150      * that is past the last point where data has been stored.
 151      */
 152     extern __shared__  char sm_dynamicShmem[];
 153     char                   *sm_nextSlotPtr = sm_dynamicShmem;
 154     static_assert(sizeof(char) == 1, "The shared memory offset calculation assumes that char is 1 byte");
 155
 156     /* shmem buffer for i x+q pre-loading */
 157     float4 *xib     = (float4 *)sm_nextSlotPtr;
 158     sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*xib));
 159
 160     /* shmem buffer for cj, for each warp separately */
 161     int *cjs        = (int *)(sm_nextSlotPtr);
 162     /* the cjs buffer's use expects a base pointer offset for pairs of warps in the j-concurrent execution */
 163     cjs            += tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize;
 164     sm_nextSlotPtr += (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));
 165     /*********************************************************************/
 166
 167
 168     nbnxn_sci_t nb_sci      = pl_sci[bidx*numParts + part]; /* my i super-cluster's index = sciOffset + current bidx * numParts + part */
 169     int         sci         = nb_sci.sci;                   /* super-cluster */
 170     int         cij4_start  = nb_sci.cj4_ind_start;         /* first ...*/
 171     int         cij4_end    = nb_sci.cj4_ind_end;           /* and last index of j clusters */
 172
 173     if (tidxz == 0)
 174     {
 175         /* Pre-load i-atom x and q into shared memory */
 176         int ci = sci * c_numClPerSupercl + tidxj;
 177         int ai = ci * c_clSize + tidxi;
 178
 179         /* We don't need q, but using float4 in shmem avoids bank conflicts.
 180            (but it also wastes L2 bandwidth). */
 181         float4 tmp = xq[ai];
 182         float4 xi  = tmp + shift_vec[nb_sci.shift];
 183         xib[tidxj * c_clSize + tidxi] = xi;
 184     }
 185     __syncthreads();
 186
 187     /* loop over the j clusters = seen by any of the atoms in the current super-cluster;
 188      * The loop stride NTHREAD_Z ensures that consecutive warps-pairs are assigned
 189      * consecutive j4's entries.
 190      */
 191     for (int j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
 192     {
 193         unsigned int imaskFull, imaskCheck, imaskNew;
 194
 195         if (haveFreshList)
 196         {
 197             /* Read the mask from the list transferred from the CPU */
 198             imaskFull = pl_cj4[j4].imei[widx].imask;
 199             /* We attempt to prune all pairs present in the original list */
 200             imaskCheck = imaskFull;
 201             imaskNew   = 0;
 202         }
 203         else
 204         {
 205             /* Read the mask from the "warp-pruned" by rlistOuter mask array */
 206             imaskFull = plist.imask[j4*c_nbnxnGpuClusterpairSplit + widx];
 207             /* Read the old rolling pruned mask, use as a base for new */
 208             imaskNew = pl_cj4[j4].imei[widx].imask;
 209             /* We only need to check pairs with different mask */
 210             imaskCheck = (imaskNew ^ imaskFull);
 211         }
 212
 213         if (imaskCheck)
 214         {
 215             /* Pre-load cj into shared memory on both warps separately */
 216             if ((tidxj == 0 || tidxj == 4) && tidxi < c_nbnxnGpuJgroupSize)
 217             {
 218                 cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
 219             }
 220             __syncwarp(c_fullWarpMask);
 221
 222 #pragma unroll 4
 223             for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
 224             {
 225                 if (imaskCheck & (superClInteractionMask << (jm * c_numClPerSupercl)))
 226                 {
 227                     unsigned int mask_ji = (1U << (jm * c_numClPerSupercl));
 228
 229                     int          cj      = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize];
 230                     int          aj      = cj * c_clSize + tidxj;
 231
 232                     /* load j atom data */
 233                     float4 tmp  = xq[aj];
 234                     float3 xj   = make_float3(tmp.x, tmp.y, tmp.z);
 235
 236 #pragma unroll 8
 237                     for (int i = 0; i < c_numClPerSupercl; i++)
 238                     {
 239                         if (imaskCheck & mask_ji)
 240                         {
 241                             /* load i-cluster coordinates from shmem */
 242                             float4 xi = xib[i * c_clSize + tidxi];
 243
 244
 245                             /* distance between i and j atoms */
 246                             float3 rv = make_float3(xi.x, xi.y, xi.z) - xj;
 247                             float  r2 = norm2(rv);
 248
 249                             /* If _none_ of the atoms pairs are in rlistOuter
 250                                range, the bit corresponding to the current
 251                                cluster-pair in imask gets set to 0. */
 252                             if (haveFreshList && !__any_sync(c_fullWarpMask, r2 < rlistOuter_sq))
 253                             {
 254                                 imaskFull &= ~mask_ji;
 255                             }
 256                             /* If any atom pair is within range, set the bit
 257                                corresponding to the current cluster-pair. */
 258                             if (__any_sync(c_fullWarpMask, r2 < rlistInner_sq))
 259                             {
 260                                 imaskNew |= mask_ji;
 261                             }
 262                         }
 263
 264                         /* shift the mask bit by 1 */
 265                         mask_ji += mask_ji;
 266                     }
 267                 }
 268             }
 269
 270             if (haveFreshList)
 271             {
 272                 /* copy the list pruned to rlistOuter to a separate buffer */
 273                 plist.imask[j4*c_nbnxnGpuClusterpairSplit + widx] = imaskFull;
 274             }
 275             /* update the imask with only the pairs up to rlistInner */
 276             plist.cj4[j4].imei[widx].imask = imaskNew;
 277         }
 278         // avoid shared memory WAR hazards between loop iterations
 279         __syncwarp(c_fullWarpMask);
 280     }
 281 }
 282 #endif /* FUNCTION_DECLARATION_ONLY */
 283
 284 #undef NTHREAD_Z
 285 #undef MIN_BLOCKS_PER_MP
 286 #undef THREADS_PER_BLOCK