src/gromacs/ewald/pme_solve_sycl.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief Implements PME GPU Fourier grid solving in SYCL.
  38  *
  39  *  \author Mark Abraham <mark.j.abraham@gmail.com>
  40  */
  41
  42 #include "gmxpre.h"
  43
  44 #include "pme_solve_sycl.h"
  45
  46 #include <cassert>
  47
  48 #include "gromacs/gpu_utils/gmxsycl.h"
  49 #include "gromacs/gpu_utils/sycl_kernel_utils.h"
  50 #include "gromacs/math/units.h"
  51
  52 #include "pme_gpu_constants.h"
  53
  54 using cl::sycl::access::mode;
  55
  56 /*! \brief
  57  * PME complex grid solver kernel function.
  58  *
  59  * \tparam     gridOrdering             Specifies the dimension ordering of the complex grid.
  60  * \tparam     computeEnergyAndVirial   Tells if the reciprocal energy and virial should be
  61  *                                        computed.
  62  * \tparam     subGroupSize             Describes the width of a SYCL subgroup
  63  */
  64 template<GridOrdering gridOrdering, bool computeEnergyAndVirial, int subGroupSize>
  65 auto makeSolveKernel(cl::sycl::handler&                            cgh,
  66                      DeviceAccessor<float, mode::read>             a_splineModuli,
  67                      DeviceAccessor<SolveKernelParams, mode::read> a_solveKernelParams,
  68                      DeviceAccessor<float, mode::read_write>       a_virialAndEnergy,
  69                      DeviceAccessor<float, mode::read_write>       a_fourierGrid)
  70 {
  71     cgh.require(a_splineModuli);
  72     cgh.require(a_solveKernelParams);
  73     cgh.require(a_virialAndEnergy);
  74     cgh.require(a_fourierGrid);
  75
  76     /* Reduce 7 outputs per warp in the shared memory */
  77     const int stride =
  78             8; // this is c_virialAndEnergyCount==7 rounded up to power of 2 for convenience, hence the assert
  79     static_assert(c_virialAndEnergyCount == 7);
  80     const int reductionBufferSize = c_solveMaxWarpsPerBlock * stride;
  81     cl::sycl::accessor<float, 1, mode::read_write, cl::sycl::target::local> sm_virialAndEnergy(
  82             cl::sycl::range<1>(reductionBufferSize), cgh);
  83
  84     /* Each thread works on one cell of the Fourier space complex 3D grid (gm_grid).
  85      * Each block handles up to c_solveMaxWarpsPerBlock * subGroupSize cells -
  86      * depending on the grid contiguous dimension size,
  87      * that can range from a part of a single gridline to several complete gridlines.
  88      */
  89     return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
  90     {
  91         /* This kernel supports 2 different grid dimension orderings: YZX and XYZ */
  92         int majorDim, middleDim, minorDim;
  93         switch (gridOrdering)
  94         {
  95             case GridOrdering::YZX:
  96                 majorDim  = YY;
  97                 middleDim = ZZ;
  98                 minorDim  = XX;
  99                 break;
 100
 101             case GridOrdering::XYZ:
 102                 majorDim  = XX;
 103                 middleDim = YY;
 104                 minorDim  = ZZ;
 105                 break;
 106
 107             default: assert(false);
 108         }
 109
 110         /* Global memory pointers */
 111         const float* __restrict__ gm_splineValueMajor =
 112                 a_splineModuli.get_pointer() + a_solveKernelParams[0].splineValuesOffset[majorDim];
 113         const float* __restrict__ gm_splineValueMiddle =
 114                 a_splineModuli.get_pointer() + a_solveKernelParams[0].splineValuesOffset[middleDim];
 115         const float* __restrict__ gm_splineValueMinor =
 116                 a_splineModuli.get_pointer() + a_solveKernelParams[0].splineValuesOffset[minorDim];
 117         // The Fourier grid is allocated as float values, even though
 118         // it logically contains complex values. (It also can be
 119         // the same memory as the real grid for in-place transforms.)
 120         // The buffer underlying the accessor may have a size that is
 121         // larger than the active grid, because it is allocated with
 122         // reallocateDeviceBuffer. The size of that larger-than-needed
 123         // grid can be an odd number of floats, even though actual
 124         // grid code only accesses up to an even number of floats. If
 125         // we would use the reinterpet method of the accessor to
 126         // convert from float to float2, runtime boundary checks can
 127         // fail because of this mismatch. So, we extract the
 128         // underlying global_ptr and use that to construct
 129         // cl::sycl::float2 values when needed.
 130         cl::sycl::global_ptr<float> gm_fourierGrid = a_fourierGrid.get_pointer();
 131
 132         /* Various grid sizes and indices */
 133         const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0;
 134         const int localSizeMinor   = a_solveKernelParams[0].complexGridSizePadded[minorDim];
 135         const int localSizeMiddle  = a_solveKernelParams[0].complexGridSizePadded[middleDim];
 136         const int localCountMiddle = a_solveKernelParams[0].complexGridSize[middleDim];
 137         const int localCountMinor  = a_solveKernelParams[0].complexGridSize[minorDim];
 138         const int nMajor           = a_solveKernelParams[0].realGridSize[majorDim];
 139         const int nMiddle          = a_solveKernelParams[0].realGridSize[middleDim];
 140         const int nMinor           = a_solveKernelParams[0].realGridSize[minorDim];
 141         const int maxkMajor        = (nMajor + 1) / 2;  // X or Y
 142         const int maxkMiddle       = (nMiddle + 1) / 2; // Y OR Z => only check for !YZX
 143         const int maxkMinor        = (nMinor + 1) / 2;  // Z or X => only check for YZX
 144
 145         const int threadLocalId     = itemIdx.get_local_linear_id();
 146         const int gridLineSize      = localCountMinor;
 147         const int gridLineIndex     = threadLocalId / gridLineSize;
 148         const int gridLineCellIndex = threadLocalId - gridLineSize * gridLineIndex;
 149         const int gridLinesPerBlock =
 150                 cl::sycl::max(itemIdx.get_local_range(2) / size_t(gridLineSize), size_t(1));
 151         const int activeWarps = (itemIdx.get_local_range(2) / subGroupSize);
 152         const int indexMinor = itemIdx.get_group(2) * itemIdx.get_local_range(2) + gridLineCellIndex;
 153         const int indexMiddle = itemIdx.get_group(1) * gridLinesPerBlock + gridLineIndex;
 154         const int indexMajor  = itemIdx.get_group(0);
 155
 156         /* Optional outputs */
 157         float energy = 0.0F;
 158         float virxx  = 0.0F;
 159         float virxy  = 0.0F;
 160         float virxz  = 0.0F;
 161         float viryy  = 0.0F;
 162         float viryz  = 0.0F;
 163         float virzz  = 0.0F;
 164
 165         assert(indexMajor < a_solveKernelParams[0].complexGridSize[majorDim]);
 166         if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor)
 167             & (gridLineIndex < gridLinesPerBlock))
 168         {
 169             /* The offset should be equal to the global thread index for coalesced access */
 170             const int gridThreadIndex =
 171                     (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor;
 172
 173             const int kMajor = indexMajor + localOffsetMajor;
 174             /* Checking either X in XYZ, or Y in YZX cases */
 175             const float mMajor = (kMajor < maxkMajor) ? kMajor : (kMajor - nMajor);
 176
 177             const int kMiddle = indexMiddle + localOffsetMiddle;
 178             float     mMiddle = kMiddle;
 179             /* Checking Y in XYZ case */
 180             if (gridOrdering == GridOrdering::XYZ)
 181             {
 182                 mMiddle = (kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle);
 183             }
 184             const int kMinor = localOffsetMinor + indexMinor;
 185             float     mMinor = kMinor;
 186             /* Checking X in YZX case */
 187             if (gridOrdering == GridOrdering::YZX)
 188             {
 189                 mMinor = (kMinor < maxkMinor) ? kMinor : (kMinor - nMinor);
 190             }
 191             /* We should skip the k-space point (0,0,0) */
 192             const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0);
 193
 194             float mX, mY, mZ;
 195             switch (gridOrdering)
 196             {
 197                 case GridOrdering::YZX:
 198                     mX = mMinor;
 199                     mY = mMajor;
 200                     mZ = mMiddle;
 201                     break;
 202
 203                 case GridOrdering::XYZ:
 204                     mX = mMajor;
 205                     mY = mMiddle;
 206                     mZ = mMinor;
 207                     break;
 208
 209                 default: assert(false);
 210             }
 211
 212             /* 0.5 correction factor for the first and last components of a Z dimension */
 213             float corner_fac = 1.0F;
 214             switch (gridOrdering)
 215             {
 216                 case GridOrdering::YZX:
 217                     if ((kMiddle == 0) | (kMiddle == maxkMiddle))
 218                     {
 219                         corner_fac = 0.5F;
 220                     }
 221                     break;
 222
 223                 case GridOrdering::XYZ:
 224                     if ((kMinor == 0) | (kMinor == maxkMinor))
 225                     {
 226                         corner_fac = 0.5F;
 227                     }
 228                     break;
 229
 230                 default: assert(false);
 231             }
 232
 233             if (notZeroPoint)
 234             {
 235                 const float mhxk = mX * a_solveKernelParams[0].recipBox[XX][XX];
 236                 const float mhyk = mX * a_solveKernelParams[0].recipBox[XX][YY]
 237                                    + mY * a_solveKernelParams[0].recipBox[YY][YY];
 238                 const float mhzk = mX * a_solveKernelParams[0].recipBox[XX][ZZ]
 239                                    + mY * a_solveKernelParams[0].recipBox[YY][ZZ]
 240                                    + mZ * a_solveKernelParams[0].recipBox[ZZ][ZZ];
 241
 242                 const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
 243                 assert(m2k != 0.0F);
 244                 float denom = m2k * float(M_PI) * a_solveKernelParams[0].boxVolume
 245                               * gm_splineValueMajor[kMajor] * gm_splineValueMiddle[kMiddle]
 246                               * gm_splineValueMinor[kMinor];
 247                 assert(sycl_2020::isfinite(denom));
 248                 assert(denom != 0.0F);
 249
 250                 const float tmp1   = cl::sycl::exp(-a_solveKernelParams[0].ewaldFactor * m2k);
 251                 const float etermk = a_solveKernelParams[0].elFactor * tmp1 / denom;
 252
 253                 // sycl::float2::load and store are buggy in hipSYCL,
 254                 // but can probably be used after resolution of
 255                 // https://github.com/illuhad/hipSYCL/issues/647
 256                 cl::sycl::float2 gridValue;
 257                 sycl_2020::loadToVec(
 258                         gridThreadIndex, cl::sycl::global_ptr<const float>(gm_fourierGrid), &gridValue);
 259                 const cl::sycl::float2 oldGridValue = gridValue;
 260                 gridValue *= etermk;
 261                 sycl_2020::storeFromVec(gridValue, gridThreadIndex, gm_fourierGrid);
 262
 263                 if (computeEnergyAndVirial)
 264                 {
 265                     const float tmp1k = 2.0F * cl::sycl::dot(gridValue, oldGridValue);
 266
 267                     float vfactor = (a_solveKernelParams[0].ewaldFactor + 1.0F / m2k) * 2.0F;
 268                     float ets2    = corner_fac * tmp1k;
 269                     energy        = ets2;
 270
 271                     float ets2vf = ets2 * vfactor;
 272
 273                     virxx = ets2vf * mhxk * mhxk - ets2;
 274                     virxy = ets2vf * mhxk * mhyk;
 275                     virxz = ets2vf * mhxk * mhzk;
 276                     viryy = ets2vf * mhyk * mhyk - ets2;
 277                     viryz = ets2vf * mhyk * mhzk;
 278                     virzz = ets2vf * mhzk * mhzk - ets2;
 279                 }
 280             }
 281         }
 282
 283         /* Optional energy/virial reduction */
 284         if (computeEnergyAndVirial)
 285         {
 286             /* A tricky shuffle reduction inspired by reduce_force_j_warp_shfl.
 287              * The idea is to reduce 7 energy/virial components into a single variable (aligned by
 288              * 8). We will reduce everything into virxx.
 289              */
 290
 291             /* We can only reduce warp-wise */
 292             const int width = subGroupSize;
 293             static_assert(subGroupSize >= 8);
 294
 295             sycl_2020::sub_group sg = itemIdx.get_sub_group();
 296
 297             /* Making pair sums */
 298             virxx += sycl_2020::shift_left(sg, virxx, 1);
 299             viryy += sycl_2020::shift_right(sg, viryy, 1);
 300             virzz += sycl_2020::shift_left(sg, virzz, 1);
 301             virxy += sycl_2020::shift_right(sg, virxy, 1);
 302             virxz += sycl_2020::shift_left(sg, virxz, 1);
 303             viryz += sycl_2020::shift_right(sg, viryz, 1);
 304             energy += sycl_2020::shift_left(sg, energy, 1);
 305             if (threadLocalId & 1)
 306             {
 307                 virxx = viryy; // virxx now holds virxx and viryy pair sums
 308                 virzz = virxy; // virzz now holds virzz and virxy pair sums
 309                 virxz = viryz; // virxz now holds virxz and viryz pair sums
 310             }
 311
 312             /* Making quad sums */
 313             virxx += sycl_2020::shift_left(sg, virxx, 2);
 314             virzz += sycl_2020::shift_right(sg, virzz, 2);
 315             virxz += sycl_2020::shift_left(sg, virxz, 2);
 316             energy += sycl_2020::shift_right(sg, energy, 2);
 317             if (threadLocalId & 2)
 318             {
 319                 virxx = virzz; // virxx now holds quad sums of virxx, virxy, virzz and virxy
 320                 virxz = energy; // virxz now holds quad sums of virxz, viryz, energy and unused paddings
 321             }
 322
 323             /* Making octet sums */
 324             virxx += sycl_2020::shift_left(sg, virxx, 4);
 325             virxz += sycl_2020::shift_right(sg, virxz, 4);
 326             if (threadLocalId & 4)
 327             {
 328                 virxx = virxz; // virxx now holds all 7 components' octet sums + unused paddings
 329             }
 330
 331             /* We only need to reduce virxx now */
 332 #pragma unroll
 333             for (int delta = 8; delta < width; delta <<= 1)
 334             {
 335                 virxx += sycl_2020::shift_left(sg, virxx, delta);
 336             }
 337             /* Now first 7 threads of each warp have the full output contributions in virxx */
 338
 339             const int  componentIndex      = threadLocalId & (subGroupSize - 1);
 340             const bool validComponentIndex = (componentIndex < c_virialAndEnergyCount);
 341
 342             if (validComponentIndex)
 343             {
 344                 const int warpIndex = threadLocalId / subGroupSize;
 345                 sm_virialAndEnergy[warpIndex * stride + componentIndex] = virxx;
 346             }
 347             itemIdx.barrier(cl::sycl::access::fence_space::local_space);
 348
 349             /* Reduce to the single warp size */
 350             const int targetIndex = threadLocalId;
 351 #pragma unroll
 352             for (int reductionStride = reductionBufferSize >> 1; reductionStride >= subGroupSize;
 353                  reductionStride >>= 1)
 354             {
 355                 const int sourceIndex = targetIndex + reductionStride;
 356                 if ((targetIndex < reductionStride) & (sourceIndex < activeWarps * stride))
 357                 {
 358                     sm_virialAndEnergy[targetIndex] += sm_virialAndEnergy[sourceIndex];
 359                 }
 360                 itemIdx.barrier(cl::sycl::access::fence_space::local_space);
 361             }
 362
 363             /* Now use shuffle again */
 364             /* NOTE: This reduction assumes there are at least 4 warps (asserted).
 365              *       To use fewer warps, add to the conditional:
 366              *       && threadLocalId < activeWarps * stride
 367              */
 368             assert(activeWarps * stride >= subGroupSize);
 369             if (threadLocalId < subGroupSize)
 370             {
 371                 float output = sm_virialAndEnergy[threadLocalId];
 372 #pragma unroll
 373                 for (int delta = stride; delta < subGroupSize; delta <<= 1)
 374                 {
 375                     output += sycl_2020::shift_left(sg, output, delta);
 376                 }
 377                 /* Final output */
 378                 if (validComponentIndex)
 379                 {
 380                     assert(sycl_2020::isfinite(output));
 381                     atomicFetchAdd(a_virialAndEnergy[componentIndex], output);
 382                 }
 383             }
 384         }
 385     };
 386 }
 387
 388 template<GridOrdering gridOrdering, bool computeEnergyAndVirial, int gridIndex, int subGroupSize>
 389 PmeSolveKernel<gridOrdering, computeEnergyAndVirial, gridIndex, subGroupSize>::PmeSolveKernel()
 390 {
 391     reset();
 392 }
 393
 394 template<GridOrdering gridOrdering, bool computeEnergyAndVirial, int gridIndex, int subGroupSize>
 395 void PmeSolveKernel<gridOrdering, computeEnergyAndVirial, gridIndex, subGroupSize>::setArg(size_t argIndex,
 396                                                                                            void* arg)
 397 {
 398     if (argIndex == 0)
 399     {
 400         auto* params = reinterpret_cast<PmeGpuKernelParams*>(arg);
 401
 402         constParams_                             = &params->constants;
 403         gridParams_                              = &params->grid;
 404         solveKernelParams_.ewaldFactor           = params->grid.ewaldFactor;
 405         solveKernelParams_.realGridSize          = params->grid.realGridSize;
 406         solveKernelParams_.complexGridSize       = params->grid.complexGridSize;
 407         solveKernelParams_.complexGridSizePadded = params->grid.complexGridSizePadded;
 408         solveKernelParams_.splineValuesOffset    = params->grid.splineValuesOffset;
 409         solveKernelParams_.recipBox[XX]          = params->current.recipBox[XX];
 410         solveKernelParams_.recipBox[YY]          = params->current.recipBox[YY];
 411         solveKernelParams_.recipBox[ZZ]          = params->current.recipBox[ZZ];
 412         solveKernelParams_.boxVolume             = params->current.boxVolume;
 413         solveKernelParams_.elFactor              = params->constants.elFactor;
 414     }
 415     else
 416     {
 417         GMX_RELEASE_ASSERT(argIndex == 0, "Trying to pass too many args to the solve kernel");
 418     }
 419 }
 420
 421 template<GridOrdering gridOrdering, bool computeEnergyAndVirial, int gridIndex, int subGroupSize>
 422 cl::sycl::event PmeSolveKernel<gridOrdering, computeEnergyAndVirial, gridIndex, subGroupSize>::launch(
 423         const KernelLaunchConfig& config,
 424         const DeviceStream&       deviceStream)
 425 {
 426     GMX_RELEASE_ASSERT(gridParams_, "Can not launch the kernel before setting its args");
 427     GMX_RELEASE_ASSERT(constParams_, "Can not launch the kernel before setting its args");
 428
 429     using KernelNameType = PmeSolveKernel<gridOrdering, computeEnergyAndVirial, gridIndex, subGroupSize>;
 430
 431     // SYCL has different multidimensional layout than OpenCL/CUDA.
 432     const cl::sycl::range<3> localSize{ config.blockSize[2], config.blockSize[1], config.blockSize[0] };
 433     const cl::sycl::range<3> groupRange{ config.gridSize[2], config.gridSize[1], config.gridSize[0] };
 434     const cl::sycl::nd_range<3> range{ groupRange * localSize, localSize };
 435
 436     cl::sycl::queue q = deviceStream.stream();
 437
 438     cl::sycl::buffer<SolveKernelParams, 1> d_solveKernelParams(&solveKernelParams_, 1);
 439     cl::sycl::event                        e = q.submit([&](cl::sycl::handler& cgh) {
 440         auto kernel = makeSolveKernel<gridOrdering, computeEnergyAndVirial, subGroupSize>(
 441                 cgh,
 442                 gridParams_->d_splineModuli[gridIndex],
 443                 d_solveKernelParams,
 444                 constParams_->d_virialAndEnergy[gridIndex],
 445                 gridParams_->d_fourierGrid[gridIndex]);
 446         cgh.parallel_for<KernelNameType>(range, kernel);
 447     });
 448
 449     // Delete set args, so we don't forget to set them before the next launch.
 450     reset();
 451
 452     return e;
 453 }
 454
 455 template<GridOrdering gridOrdering, bool computeEnergyAndVirial, int gridIndex, int subGroupSize>
 456 void PmeSolveKernel<gridOrdering, computeEnergyAndVirial, gridIndex, subGroupSize>::reset()
 457 {
 458     gridParams_  = nullptr;
 459     constParams_ = nullptr;
 460 }
 461
 462 //! Kernel class instantiations
 463 /* Disable the "explicit template instantiation 'PmeSplineAndSpreadKernel<...>' will emit a vtable in every
 464  * translation unit [-Wweak-template-vtables]" warning.
 465  * It is only explicitly instantiated in this translation unit, so we should be safe.
 466  */
 467 #ifdef __clang__
 468 #    pragma clang diagnostic push
 469 #    pragma clang diagnostic ignored "-Wweak-template-vtables"
 470 #endif
 471
 472 #define INSTANTIATE(subGroupSize)                                             \
 473     template class PmeSolveKernel<GridOrdering::XYZ, false, 0, subGroupSize>; \
 474     template class PmeSolveKernel<GridOrdering::XYZ, true, 0, subGroupSize>;  \
 475     template class PmeSolveKernel<GridOrdering::YZX, false, 0, subGroupSize>; \
 476     template class PmeSolveKernel<GridOrdering::YZX, true, 0, subGroupSize>;  \
 477     template class PmeSolveKernel<GridOrdering::XYZ, false, 1, subGroupSize>; \
 478     template class PmeSolveKernel<GridOrdering::XYZ, true, 1, subGroupSize>;  \
 479     template class PmeSolveKernel<GridOrdering::YZX, false, 1, subGroupSize>; \
 480     template class PmeSolveKernel<GridOrdering::YZX, true, 1, subGroupSize>;
 481
 482 #if GMX_SYCL_DPCPP
 483 INSTANTIATE(16);
 484 #elif GMX_SYCL_HIPSYCL
 485 INSTANTIATE(32);
 486 INSTANTIATE(64);
 487 #endif
 488
 489 #ifdef __clang__
 490 #    pragma clang diagnostic pop
 491 #endif