src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief
  38  *  NBNXM SYCL kernels
  39  *
  40  *  \ingroup module_nbnxm
  41  */
  42 #include "gmxpre.h"
  43
  44 #include "nbnxm_sycl_kernel_pruneonly.h"
  45
  46 #include "gromacs/gpu_utils/devicebuffer.h"
  47 #include "gromacs/gpu_utils/gmxsycl.h"
  48 #include "gromacs/utility/template_mp.h"
  49
  50 #include "nbnxm_sycl_kernel_utils.h"
  51 #include "nbnxm_sycl_types.h"
  52
  53 using cl::sycl::access::fence_space;
  54 using cl::sycl::access::mode;
  55 using cl::sycl::access::target;
  56
  57 namespace Nbnxm
  58 {
  59
  60 /*! \brief Prune-only kernel for NBNXM.
  61  *
  62  */
  63 template<bool haveFreshList>
  64 auto nbnxmKernelPruneOnly(cl::sycl::handler&                            cgh,
  65                           DeviceAccessor<Float4, mode::read>            a_xq,
  66                           DeviceAccessor<Float3, mode::read>            a_shiftVec,
  67                           DeviceAccessor<nbnxn_cj4_t, mode::read_write> a_plistCJ4,
  68                           DeviceAccessor<nbnxn_sci_t, mode::read>       a_plistSci,
  69                           DeviceAccessor<unsigned int, haveFreshList ? mode::write : mode::read> a_plistIMask,
  70                           const float rlistOuterSq,
  71                           const float rlistInnerSq,
  72                           const int   numParts,
  73                           const int   part)
  74 {
  75     cgh.require(a_xq);
  76     cgh.require(a_shiftVec);
  77     cgh.require(a_plistCJ4);
  78     cgh.require(a_plistSci);
  79     cgh.require(a_plistIMask);
  80
  81     /* shmem buffer for i x+q pre-loading */
  82     cl::sycl::accessor<Float4, 2, mode::read_write, target::local> sm_xq(
  83             cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
  84
  85     constexpr int warpSize = c_clSize * c_clSize / 2;
  86
  87     /* Somewhat weird behavior inherited from OpenCL.
  88      * With clSize == 4, we use sub_group size of 16 (not enforced in OpenCL implementation, but chosen
  89      * by the IGC compiler), however for data layout we consider it to be 8.
  90      * Setting sub_group size to 8 slows down the prune-only kernel 1.5-2 times.
  91      * For clSize == But we need to set specific sub_group size >= 32 for clSize == 8 for correctness,
  92      * but it causes very poor performance.
  93      */
  94     constexpr int gmx_unused requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize;
  95
  96     /* Requirements:
  97      * Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy
  98      * to change). */
  99     return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(requiredSubGroupSize)]]
 100     {
 101         const cl::sycl::id<3> localId = unflattenId<c_clSize, c_clSize>(itemIdx.get_local_id());
 102         // thread/block/warp id-s
 103         const unsigned tidxi = localId[0];
 104         const unsigned tidxj = localId[1];
 105         const int      tidx  = tidxj * c_clSize + tidxi;
 106         const unsigned tidxz = localId[2];
 107         const unsigned bidx  = itemIdx.get_group(0);
 108
 109         const sycl_2020::sub_group sg   = itemIdx.get_sub_group();
 110         const unsigned             widx = tidx / warpSize;
 111
 112         // my i super-cluster's index = sciOffset + current bidx * numParts + part
 113         const nbnxn_sci_t nbSci     = a_plistSci[bidx * numParts + part];
 114         const int         sci       = nbSci.sci;           /* super-cluster */
 115         const int         cij4Start = nbSci.cj4_ind_start; /* first ...*/
 116         const int         cij4End   = nbSci.cj4_ind_end;   /* and last index of j clusters */
 117
 118         if (tidxz == 0)
 119         {
 120             for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += c_clSize)
 121             {
 122                 /* Pre-load i-atom x and q into shared memory */
 123                 const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i;
 124                 const int ai = ci * c_clSize + tidxi;
 125
 126                 /* We don't need q, but using float4 in shmem avoids bank conflicts.
 127                    (but it also wastes L2 bandwidth). */
 128                 const Float4 xq    = a_xq[ai];
 129                 const Float3 shift = a_shiftVec[nbSci.shift];
 130                 const Float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]);
 131                 sm_xq[tidxj + i][tidxi] = xi;
 132             }
 133         }
 134         itemIdx.barrier(fence_space::local_space);
 135
 136         /* loop over the j clusters = seen by any of the atoms in the current super-cluster.
 137          * The loop stride c_syclPruneKernelJ4Concurrency ensures that consecutive warps-pairs are
 138          * assigned consecutive j4's entries. */
 139         for (int j4 = cij4Start + tidxz; j4 < cij4End; j4 += c_syclPruneKernelJ4Concurrency)
 140         {
 141             unsigned imaskFull, imaskCheck, imaskNew;
 142
 143             if constexpr (haveFreshList)
 144             {
 145                 /* Read the mask from the list transferred from the CPU */
 146                 imaskFull = a_plistCJ4[j4].imei[widx].imask;
 147                 /* We attempt to prune all pairs present in the original list */
 148                 imaskCheck = imaskFull;
 149                 imaskNew   = 0;
 150             }
 151             else
 152             {
 153                 /* Read the mask from the "warp-pruned" by rlistOuter mask array */
 154                 imaskFull = a_plistIMask[j4 * c_nbnxnGpuClusterpairSplit + widx];
 155                 /* Read the old rolling pruned mask, use as a base for new */
 156                 imaskNew = a_plistCJ4[j4].imei[widx].imask;
 157                 /* We only need to check pairs with different mask */
 158                 imaskCheck = (imaskNew ^ imaskFull);
 159             }
 160
 161             if (imaskCheck)
 162             {
 163                 for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
 164                 {
 165                     if (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
 166                     {
 167                         unsigned mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
 168                         // SYCL-TODO: Reevaluate prefetching methods
 169                         const int cj = a_plistCJ4[j4].cj[jm];
 170                         const int aj = cj * c_clSize + tidxj;
 171
 172                         /* load j atom data */
 173                         const Float4 tmp = a_xq[aj];
 174                         const Float3 xj(tmp[0], tmp[1], tmp[2]);
 175
 176                         for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
 177                         {
 178                             if (imaskCheck & mask_ji)
 179                             {
 180                                 // load i-cluster coordinates from shmem
 181                                 const Float4 xi = sm_xq[i][tidxi];
 182                                 // distance between i and j atoms
 183                                 Float3 rv(xi[0], xi[1], xi[2]);
 184                                 rv -= xj;
 185                                 const float r2 = norm2(rv);
 186
 187                                 /* If _none_ of the atoms pairs are in rlistOuter
 188                                  * range, the bit corresponding to the current
 189                                  * cluster-pair in imask gets set to 0. */
 190                                 if (haveFreshList && !(sycl_2020::group_any_of(sg, r2 < rlistOuterSq)))
 191                                 {
 192                                     imaskFull &= ~mask_ji;
 193                                 }
 194                                 /* If any atom pair is within range, set the bit
 195                                  * corresponding to the current cluster-pair. */
 196                                 if (sycl_2020::group_any_of(sg, r2 < rlistInnerSq))
 197                                 {
 198                                     imaskNew |= mask_ji;
 199                                 }
 200                             } // (imaskCheck & mask_ji)
 201                             /* shift the mask bit by 1 */
 202                             mask_ji += mask_ji;
 203                         } // (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
 204                     } // (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
 205                 } // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
 206
 207                 if constexpr (haveFreshList)
 208                 {
 209                     /* copy the list pruned to rlistOuter to a separate buffer */
 210                     a_plistIMask[j4 * c_nbnxnGpuClusterpairSplit + widx] = imaskFull;
 211                 }
 212                 /* update the imask with only the pairs up to rlistInner */
 213                 a_plistCJ4[j4].imei[widx].imask = imaskNew;
 214             } // (imaskCheck)
 215         } // for (int j4 = cij4_start + tidxz; j4 < cij4_end; j4 += c_syclPruneKernelJ4Concurrency)
 216     };
 217 }
 218
 219 // SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
 220 template<bool haveFreshList>
 221 class NbnxmKernelPruneOnlyName;
 222
 223 template<bool haveFreshList, class... Args>
 224 cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream,
 225                                            const int           numSciInPart,
 226                                            Args&&... args)
 227 {
 228     // Should not be needed for SYCL2020.
 229     using kernelNameType = NbnxmKernelPruneOnlyName<haveFreshList>;
 230
 231     /* Kernel launch config:
 232      * - The thread block dimensions match the size of i-clusters, j-clusters,
 233      *   and j-cluster concurrency, in x, y, and z, respectively.
 234      * - The 1D block-grid contains as many blocks as super-clusters.
 235      */
 236     const unsigned long         numBlocks = numSciInPart;
 237     const cl::sycl::range<3>    blockSize{ c_clSize, c_clSize, c_syclPruneKernelJ4Concurrency };
 238     const cl::sycl::range<3>    globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] };
 239     const cl::sycl::nd_range<3> range{ globalSize, blockSize };
 240
 241     cl::sycl::queue q = deviceStream.stream();
 242
 243     cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
 244         auto kernel = nbnxmKernelPruneOnly<haveFreshList>(cgh, std::forward<Args>(args)...);
 245         cgh.parallel_for<kernelNameType>(flattenNDRange(range), kernel);
 246     });
 247
 248     return e;
 249 }
 250
 251 template<class... Args>
 252 cl::sycl::event chooseAndLaunchNbnxmKernelPruneOnly(bool haveFreshList, Args&&... args)
 253 {
 254     return gmx::dispatchTemplatedFunction(
 255             [&](auto haveFreshList_) {
 256                 return launchNbnxmKernelPruneOnly<haveFreshList_>(std::forward<Args>(args)...);
 257             },
 258             haveFreshList);
 259 }
 260
 261 void launchNbnxmKernelPruneOnly(NbnxmGpu*                 nb,
 262                                 const InteractionLocality iloc,
 263                                 const int                 numParts,
 264                                 const int                 part,
 265                                 const int                 numSciInPart)
 266 {
 267     NBAtomDataGpu*      adat          = nb->atdat;
 268     NBParamGpu*         nbp           = nb->nbparam;
 269     gpu_plist*          plist         = nb->plist[iloc];
 270     const bool          haveFreshList = plist->haveFreshList;
 271     const DeviceStream& deviceStream  = *nb->deviceStreams[iloc];
 272
 273     cl::sycl::event e = chooseAndLaunchNbnxmKernelPruneOnly(haveFreshList,
 274                                                             deviceStream,
 275                                                             numSciInPart,
 276                                                             adat->xq,
 277                                                             adat->shiftVec,
 278                                                             plist->cj4,
 279                                                             plist->sci,
 280                                                             plist->imask,
 281                                                             nbp->rlistOuter_sq,
 282                                                             nbp->rlistInner_sq,
 283                                                             numParts,
 284                                                             part);
 285 }
 286
 287 } // namespace Nbnxm