src/gromacs/ewald/pme_gpu_internal.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36 /*! \internal \file
37  *
38  * \brief This file contains internal function implementations
39  * for performing the PME calculations on GPU.
40  *
41  * Note that this file is compiled as regular C++ source in OpenCL builds, but
42  * it is treated as CUDA source in CUDA-enabled GPU builds.
43  *
44  * \author Aleksei Iupinov <a.yupinov@gmail.com>
45  * \ingroup module_ewald
46  */
47
48 #include "gmxpre.h"
49
50 #include "pme_gpu_internal.h"
51
52 #include "config.h"
53
54 #include <list>
55 #include <memory>
56 #include <string>
57
58 #include "gromacs/ewald/ewald_utils.h"
59 #include "gromacs/gpu_utils/gpu_utils.h"
60 #include "gromacs/math/invertmatrix.h"
61 #include "gromacs/math/units.h"
62 #include "gromacs/timing/gpu_timing.h"
63 #include "gromacs/utility/exceptions.h"
64 #include "gromacs/utility/fatalerror.h"
65 #include "gromacs/utility/gmxassert.h"
66 #include "gromacs/utility/logger.h"
67 #include "gromacs/utility/stringutil.h"
68
69 #if GMX_GPU == GMX_GPU_CUDA
70 #    include "gromacs/gpu_utils/pmalloc_cuda.h"
71
72 #    include "pme.cuh"
73 #elif GMX_GPU == GMX_GPU_OPENCL
74 #    include "gromacs/gpu_utils/gmxopencl.h"
75 #endif
76
77 #include "gromacs/ewald/pme.h"
78
79 #include "pme_gpu_3dfft.h"
80 #include "pme_gpu_constants.h"
81 #include "pme_gpu_program_impl.h"
82 #include "pme_gpu_timings.h"
83 #include "pme_gpu_types.h"
84 #include "pme_gpu_types_host.h"
85 #include "pme_gpu_types_host_impl.h"
86 #include "pme_gpu_utils.h"
87 #include "pme_grid.h"
88 #include "pme_internal.h"
89 #include "pme_solve.h"
90
91 /*! \brief
92  * CUDA only.
93  * Atom limit above which it is advantageous to turn on recalculating
94  * the splines in the gather and to use fewer threads per atom in the spline and spread kernels.
95  */
96 constexpr int c_pmeGpuPerformanceAtomLimit = 23000;
97
98 /*! \internal \brief
99  * Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.
100  *
101  * \param[in] pmeGpu  The PME GPU structure.
102  * \returns The pointer to the kernel parameters.
103  */
104 static PmeGpuKernelParamsBase* pme_gpu_get_kernel_params_base_ptr(const PmeGpu* pmeGpu)
105 {
106     // reinterpret_cast is needed because the derived CUDA structure is not known in this file
107     auto* kernelParamsPtr = reinterpret_cast<PmeGpuKernelParamsBase*>(pmeGpu->kernelParams.get());
108     return kernelParamsPtr;
109 }
110
111 int pme_gpu_get_atom_data_alignment(const PmeGpu* /*unused*/)
112 {
113     // TODO: this can be simplified, as c_pmeAtomDataAlignment is now constant
114     if (c_usePadding)
115     {
116         return c_pmeAtomDataAlignment;
117     }
118     else
119     {
120         return 0;
121     }
122 }
123
124 int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
125 {
126     if (pmeGpu->settings.useOrderThreadsPerAtom)
127     {
128         return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
129     }
130     else
131     {
132         return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
133     }
134 }
135
136 void pme_gpu_synchronize(const PmeGpu* pmeGpu)
137 {
138     gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream);
139 }
140
141 void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu)
142 {
143     const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
144     allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount,
145                          pmeGpu->archSpecific->context);
146     pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize);
147 }
148
149 void pme_gpu_free_energy_virial(PmeGpu* pmeGpu)
150 {
151     freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy);
152     pfree(pmeGpu->staging.h_virialAndEnergy);
153     pmeGpu->staging.h_virialAndEnergy = nullptr;
154 }
155
156 void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu)
157 {
158     clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0,
159                            c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream);
160 }
161
162 void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu)
163 {
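    /* Note: the B-spline moduli for all three dimensions are packed back to back into a single
     * device buffer; splineValuesOffset[] records where each dimension's block starts
     * (0, nx, nx + ny), matching the per-dimension copies from pmeGpu->common->bsp_mod[] below. */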
164     const int splineValuesOffset[DIM] = { 0, pmeGpu->kernelParams->grid.realGridSize[XX],
165                                           pmeGpu->kernelParams->grid.realGridSize[XX]
166                                                   + pmeGpu->kernelParams->grid.realGridSize[YY] };
167     memcpy(&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));
168
169     const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX]
170                                     + pmeGpu->kernelParams->grid.realGridSize[YY]
171                                     + pmeGpu->kernelParams->grid.realGridSize[ZZ];
172     const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
173     reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
174                            &pmeGpu->archSpecific->splineValuesSize,
175                            &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->context);
176     if (shouldRealloc)
177     {
178         /* Reallocate the host buffer */
179         pfree(pmeGpu->staging.h_splineModuli);
180         pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_splineModuli),
181                 newSplineValuesSize * sizeof(float));
182     }
183     for (int i = 0; i < DIM; i++)
184     {
185         memcpy(pmeGpu->staging.h_splineModuli + splineValuesOffset[i],
186                pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
187     }
188     /* TODO: pin original buffer instead! */
189     copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli,
190                        0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream,
191                        pmeGpu->settings.transferKind, nullptr);
192 }
193
194 void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu)
195 {
196     pfree(pmeGpu->staging.h_splineModuli);
197     freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli);
198 }
199
200 void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
201 {
202     const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
203     GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
204     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
205                            &pmeGpu->archSpecific->forcesSize,
206                            &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->context);
207     pmeGpu->staging.h_forces.reserveWithPadding(pmeGpu->nAtomsAlloc);
208     pmeGpu->staging.h_forces.resizeWithPadding(pmeGpu->kernelParams->atoms.nAtoms);
209 }
210
211 void pme_gpu_free_forces(const PmeGpu* pmeGpu)
212 {
213     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces);
214 }
215
216 void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
217 {
218     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
219     float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
220     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat, 0,
221                        DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream,
222                        pmeGpu->settings.transferKind, nullptr);
223 }
224
225 void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
226 {
227     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
228     float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
229     copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0,
230                          DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream,
231                          pmeGpu->settings.transferKind, nullptr);
232 }
233
234 void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients)
235 {
236     GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
237     const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
238     GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
239     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
240                            &pmeGpu->archSpecific->coefficientsSize,
241                            &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->context);
242     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
243                        const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
244                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
245     if (c_usePadding)
246     {
247         const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
248         const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
249         if (paddingCount > 0)
250         {
251             clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
252                                    paddingCount, pmeGpu->archSpecific->pmeStream);
253         }
254     }
255 }
256
257 void pme_gpu_free_coefficients(const PmeGpu* pmeGpu)
258 {
259     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients);
260 }
261
262 void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
263 {
264     const int    order        = pmeGpu->common->pme_order;
265     const int    alignment    = pme_gpu_get_atoms_per_warp(pmeGpu);
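    // Round the allocation up to a whole number of warps' worth of atoms, so that spline data is
    // stored in complete per-warp blocks (illustrative: 23 atoms with 4 atoms per warp pad to 24).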
266     const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
267     const int    newSplineDataSize = DIM * order * nAtomsPadded;
268     GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
269     /* Two arrays of the same size */
270     const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
271     int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
272     int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
273     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
274                            &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->context);
275     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
276                            &pmeGpu->archSpecific->splineDataSize,
277                            &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->context);
278     // the host side reallocation
279     if (shouldRealloc)
280     {
281         pfree(pmeGpu->staging.h_theta);
282         pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_theta), newSplineDataSize * sizeof(float));
283         pfree(pmeGpu->staging.h_dtheta);
284         pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_dtheta), newSplineDataSize * sizeof(float));
285     }
286 }
287
288 void pme_gpu_free_spline_data(const PmeGpu* pmeGpu)
289 {
290     /* Two arrays of the same size */
291     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta);
292     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta);
293     pfree(pmeGpu->staging.h_theta);
294     pfree(pmeGpu->staging.h_dtheta);
295 }
296
297 void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu)
298 {
299     const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
300     GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
301     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
302                            &pmeGpu->archSpecific->gridlineIndicesSize,
303                            &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context);
304     pfree(pmeGpu->staging.h_gridlineIndices);
305     pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_gridlineIndices), newIndicesSize * sizeof(int));
306 }
307
308 void pme_gpu_free_grid_indices(const PmeGpu* pmeGpu)
309 {
310     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices);
311     pfree(pmeGpu->staging.h_gridlineIndices);
312 }
313
314 void pme_gpu_realloc_grids(PmeGpu* pmeGpu)
315 {
316     auto*     kernelParamsPtr = pmeGpu->kernelParams.get();
317     const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX]
318                                 * kernelParamsPtr->grid.realGridSizePadded[YY]
319                                 * kernelParamsPtr->grid.realGridSizePadded[ZZ];
320     const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX]
321                                    * kernelParamsPtr->grid.complexGridSizePadded[YY]
322                                    * kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
323     // Multiplied by 2 because the complex grid size is counted in complex numbers, while all allocations/pointers are in floats
324     if (pmeGpu->archSpecific->performOutOfPlaceFFT)
325     {
326         /* 2 separate grids */
327         reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
328                                &pmeGpu->archSpecific->complexGridSize,
329                                &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->context);
330         reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
331                                &pmeGpu->archSpecific->realGridSize,
332                                &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
333     }
334     else
335     {
336         /* A single buffer so that any grid will fit */
337         const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
338         reallocateDeviceBuffer(
339                 &kernelParamsPtr->grid.d_realGrid, newGridsSize, &pmeGpu->archSpecific->realGridSize,
340                 &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
341         kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
342         pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
343         // the size might get used later for copying the grid
344     }
345 }
346
347 void pme_gpu_free_grids(const PmeGpu* pmeGpu)
348 {
349     if (pmeGpu->archSpecific->performOutOfPlaceFFT)
350     {
351         freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid);
352     }
353     freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid);
354 }
355
356 void pme_gpu_clear_grids(const PmeGpu* pmeGpu)
357 {
358     clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0,
359                            pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream);
360 }
361
362 void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
363 {
364     pme_gpu_free_fract_shifts(pmeGpu);
365
366     auto* kernelParamsPtr = pmeGpu->kernelParams.get();
367
368     const int nx                  = kernelParamsPtr->grid.realGridSize[XX];
369     const int ny                  = kernelParamsPtr->grid.realGridSize[YY];
370     const int nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
371     const int cellCount           = c_pmeNeighborUnitcellCount;
372     const int gridDataOffset[DIM] = { 0, cellCount * nx, cellCount * (nx + ny) };
373
374     memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));
375
376     const int newFractShiftsSize = cellCount * (nx + ny + nz);
377
378 #if GMX_GPU == GMX_GPU_CUDA
379     initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable, kernelParamsPtr->fractShiftsTableTexture,
380                          pmeGpu->common->fsh.data(), newFractShiftsSize);
381
382     initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
383                          kernelParamsPtr->gridlineIndicesTableTexture, pmeGpu->common->nn.data(),
384                          newFractShiftsSize);
385 #elif GMX_GPU == GMX_GPU_OPENCL
386     // No dedicated texture routines in OpenCL, so plain device buffers and explicit copies are used
387     allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize,
388                          pmeGpu->archSpecific->context);
389     allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize,
390                          pmeGpu->archSpecific->context);
391     copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0,
392                        newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
393                        GpuApiCallBehavior::Async, nullptr);
394     copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(), 0,
395                        newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
396                        GpuApiCallBehavior::Async, nullptr);
397 #endif
398 }
399
400 void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu)
401 {
402     auto* kernelParamsPtr = pmeGpu->kernelParams.get();
403 #if GMX_GPU == GMX_GPU_CUDA
404     destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
405                             kernelParamsPtr->fractShiftsTableTexture);
406     destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
407                             kernelParamsPtr->gridlineIndicesTableTexture);
408 #elif GMX_GPU == GMX_GPU_OPENCL
409     freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
410     freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
411 #endif
412 }
413
414 bool pme_gpu_stream_query(const PmeGpu* pmeGpu)
415 {
416     return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream);
417 }
418
419 void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid)
420 {
421     copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid, 0, pmeGpu->archSpecific->realGridSize,
422                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
423 }
424
425 void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid)
426 {
427     copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid, 0,
428                          pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream,
429                          pmeGpu->settings.transferKind, nullptr);
430     pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream);
431 }
432
433 void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
434 {
435     const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
436     const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
437     const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
438     auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
439     copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
440                          pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
441     copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta, 0, splinesCount,
442                          pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
443     copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices,
444                          0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream,
445                          pmeGpu->settings.transferKind, nullptr);
446 }
447
448 void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
449 {
450     const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
451     const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
452     const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
453     auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
454     if (c_usePadding)
455     {
456         // TODO: could clear only the padding and not the whole thing, but this is test-exclusive code anyway
457         clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0,
458                                pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream);
459         clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
460                                pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
461                                pmeGpu->archSpecific->pmeStream);
462         clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
463                                pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
464                                pmeGpu->archSpecific->pmeStream);
465     }
466     copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
467                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
468     copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
469                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
470     copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
471                        0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream,
472                        pmeGpu->settings.transferKind, nullptr);
473 }
474
475 void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu)
476 {
477     pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
478 }
479
480 void pme_gpu_init_internal(PmeGpu* pmeGpu)
481 {
482 #if GMX_GPU == GMX_GPU_CUDA
483     // Prepare to use the device that this PME task was assigned earlier.
484     // Other entities, such as CUDA timing events, are known to implicitly use the device context.
485     CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
486 #endif
487
488     /* Allocate the target-specific structures */
489     pmeGpu->archSpecific.reset(new PmeGpuSpecific());
490     pmeGpu->kernelParams.reset(new PmeGpuKernelParams());
491
492     pmeGpu->archSpecific->performOutOfPlaceFFT = true;
493     /* This should give better performance, according to the cuFFT documentation.
494      * The performance seems to be the same though.
495      * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
496      */
497
498     // TODO: this is just a convenient reuse, because programHandle_ is currently in charge of creating the context
499     pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context;
500
501     // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
502     if (GMX_GPU == GMX_GPU_CUDA)
503     {
504         /* WARNING: CUDA timings are incorrect with multiple streams.
505          *          This is the main reason why they are disabled by default.
506          */
507         // TODO: Consider turning on by default when we can detect nr of streams.
508         pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
509     }
510     else if (GMX_GPU == GMX_GPU_OPENCL)
511     {
512         pmeGpu->archSpecific->useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
513     }
514
515 #if GMX_GPU == GMX_GPU_CUDA
516     pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
517 #elif GMX_GPU == GMX_GPU_OPENCL
518     pmeGpu->maxGridWidthX = INT32_MAX / 2;
519     // TODO: is there really no global work size limit in OpenCL?
520 #endif
521
522     /* Creating a PME GPU stream:
523      * - default high priority with CUDA
524      * - no priorities implemented yet with OpenCL; see #2532
525      */
526 #if GMX_GPU == GMX_GPU_CUDA
527     cudaError_t stat;
528     int         highest_priority, lowest_priority;
529     stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
530     CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
531     stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream,
532                                         cudaStreamDefault, // cudaStreamNonBlocking,
533                                         highest_priority);
534     CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
535 #elif GMX_GPU == GMX_GPU_OPENCL
536     cl_command_queue_properties queueProperties =
537             pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
538     cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId;
539     cl_int       clError;
540     pmeGpu->archSpecific->pmeStream =
541             clCreateCommandQueue(pmeGpu->archSpecific->context, device_id, queueProperties, &clError);
542     if (clError != CL_SUCCESS)
543     {
544         GMX_THROW(gmx::InternalError("Failed to create PME command queue"));
545     }
546 #endif
547 }
548
549 void pme_gpu_destroy_specific(const PmeGpu* pmeGpu)
550 {
551 #if GMX_GPU == GMX_GPU_CUDA
552     /* Destroy the CUDA stream */
553     cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream);
554     CU_RET_ERR(stat, "PME cudaStreamDestroy error");
555 #elif GMX_GPU == GMX_GPU_OPENCL
556     cl_int clError = clReleaseCommandQueue(pmeGpu->archSpecific->pmeStream);
557     if (clError != CL_SUCCESS)
558     {
559         gmx_warning("Failed to destroy PME command queue");
560     }
561 #endif
562 }
563
564 void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
565 {
566     if (pme_gpu_settings(pmeGpu).performGPUFFT)
567     {
568         pmeGpu->archSpecific->fftSetup.resize(0);
569         for (int i = 0; i < pmeGpu->common->ngrids; i++)
570         {
571             pmeGpu->archSpecific->fftSetup.push_back(std::make_unique<GpuParallel3dFft>(pmeGpu));
572         }
573     }
574 }
575
576 void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu)
577 {
578     pmeGpu->archSpecific->fftSetup.resize(0);
579 }
580
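// Note: this maps (order, splineIndex, dimIndex, atomIndex) onto the flat index of the GPU spline
// data layout by dispatching to the templated index helpers for the supported atoms-per-warp
// values; it is only used by the test-oriented layout transform further below.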
581 int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
582 {
583     if (order != c_pmeGpuOrder)
584     {
585         throw order;
586     }
587     constexpr int fixedOrder = c_pmeGpuOrder;
588     GMX_UNUSED_VALUE(fixedOrder);
589
590     const int atomWarpIndex = atomIndex % atomsPerWarp;
591     const int warpIndex     = atomIndex / atomsPerWarp;
592     int       indexBase, result;
593     switch (atomsPerWarp)
594     {
595         case 1:
596             indexBase = getSplineParamIndexBase<fixedOrder, 1>(warpIndex, atomWarpIndex);
597             result    = getSplineParamIndex<fixedOrder, 1>(indexBase, dimIndex, splineIndex);
598             break;
599
600         case 2:
601             indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
602             result    = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
603             break;
604
605         case 4:
606             indexBase = getSplineParamIndexBase<fixedOrder, 4>(warpIndex, atomWarpIndex);
607             result    = getSplineParamIndex<fixedOrder, 4>(indexBase, dimIndex, splineIndex);
608             break;
609
610         case 8:
611             indexBase = getSplineParamIndexBase<fixedOrder, 8>(warpIndex, atomWarpIndex);
612             result    = getSplineParamIndex<fixedOrder, 8>(indexBase, dimIndex, splineIndex);
613             break;
614
615         default:
616             GMX_THROW(gmx::NotImplementedError(
617                     gmx::formatString("Test function call not unrolled for atomsPerWarp = %d in "
618                                       "getSplineParamFullIndex",
619                                       atomsPerWarp)));
620     }
621     return result;
622 }
623
624 void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, PmeOutput* output)
625 {
626     const PmeGpu* pmeGpu = pme.gpu;
627     for (int j = 0; j < c_virialAndEnergyCount; j++)
628     {
629         GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[j]),
630                    "PME GPU produces incorrect energy/virial.");
631     }
632
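    // The constant factors below (0.25 for the virial components, 0.5 for the energy) convert the
    // raw accumulators produced by the solve kernel into the final virial and energy values.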
633     unsigned int j                 = 0;
634     output->coulombVirial_[XX][XX] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
635     output->coulombVirial_[YY][YY] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
636     output->coulombVirial_[ZZ][ZZ] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
637     output->coulombVirial_[XX][YY] = output->coulombVirial_[YY][XX] =
638             0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
639     output->coulombVirial_[XX][ZZ] = output->coulombVirial_[ZZ][XX] =
640             0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
641     output->coulombVirial_[YY][ZZ] = output->coulombVirial_[ZZ][YY] =
642             0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
643     output->coulombEnergy_ = 0.5F * pmeGpu->staging.h_virialAndEnergy[j++];
644 }
645
646 /*! \brief Sets the force-related members in \p output
647  *
648  * \param[in]   pmeGpu      PME GPU data structure
649  * \param[out]  output      Pointer to PME output data structure
650  */
651 static void pme_gpu_getForceOutput(PmeGpu* pmeGpu, PmeOutput* output)
652 {
653     output->haveForceOutput_ = !pmeGpu->settings.useGpuForceReduction;
654     if (output->haveForceOutput_)
655     {
656         output->forces_ = pmeGpu->staging.h_forces;
657     }
658 }
659
660 PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const int flags)
661 {
662     PmeGpu*    pmeGpu                      = pme.gpu;
663     const bool haveComputedEnergyAndVirial = (flags & GMX_PME_CALC_ENER_VIR) != 0;
664
665     PmeOutput output;
666
667     pme_gpu_getForceOutput(pmeGpu, &output);
668
669     // The caller knows from the flags that the energy and the virial are not usable
670     // on the else branch
671     if (haveComputedEnergyAndVirial)
672     {
673         if (pme_gpu_settings(pmeGpu).performGPUSolve)
674         {
675             pme_gpu_getEnergyAndVirial(pme, &output);
676         }
677         else
678         {
679             get_pme_ener_vir_q(pme.solve_work, pme.nthread, &output);
680         }
681     }
682     return output;
683 }
684
685 void pme_gpu_update_input_box(PmeGpu gmx_unused* pmeGpu, const matrix gmx_unused box)
686 {
687 #if GMX_DOUBLE
688     GMX_THROW(gmx::NotImplementedError("PME is implemented for single-precision only on GPU"));
689 #else
690     matrix scaledBox;
691     pmeGpu->common->boxScaler->scaleBox(box, scaledBox);
692     auto* kernelParamsPtr              = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
693     kernelParamsPtr->current.boxVolume = scaledBox[XX][XX] * scaledBox[YY][YY] * scaledBox[ZZ][ZZ];
694     GMX_ASSERT(kernelParamsPtr->current.boxVolume != 0.0F, "Zero volume of the unit cell");
695     matrix recipBox;
696     gmx::invertBoxMatrix(scaledBox, recipBox);
697
698     /* The GPU recipBox is transposed as compared to the CPU recipBox.
699      * Spread uses matrix columns (while solve and gather use rows).
700      * There is no particular reason for this; it might be further rethought/optimized for better access patterns.
701      */
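    // GROMACS box matrices are lower-triangular, so recipBox is too; the hard-coded zeros below
    // are the reciprocal-box elements that are always zero, written here in the transposed layout.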
702     const real newRecipBox[DIM][DIM] = { { recipBox[XX][XX], recipBox[YY][XX], recipBox[ZZ][XX] },
703                                          { 0.0, recipBox[YY][YY], recipBox[ZZ][YY] },
704                                          { 0.0, 0.0, recipBox[ZZ][ZZ] } };
705     memcpy(kernelParamsPtr->current.recipBox, newRecipBox, sizeof(matrix));
706 #endif
707 }
708
709 /*! \brief \libinternal
710  * (Re-)initializes all the PME GPU data related to the grid size and cut-off.
711  *
712  * \param[in] pmeGpu            The PME GPU structure.
713  */
714 static void pme_gpu_reinit_grids(PmeGpu* pmeGpu)
715 {
716     auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
717     kernelParamsPtr->grid.ewaldFactor =
718             (M_PI * M_PI) / (pmeGpu->common->ewaldcoeff_q * pmeGpu->common->ewaldcoeff_q);
719
720     /* The grid size variants */
721     for (int i = 0; i < DIM; i++)
722     {
723         kernelParamsPtr->grid.realGridSize[i] = pmeGpu->common->nk[i];
724         kernelParamsPtr->grid.realGridSizeFP[i] =
725                 static_cast<float>(kernelParamsPtr->grid.realGridSize[i]);
726         kernelParamsPtr->grid.realGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
727
728         // The complex grid currently uses no padding;
729         // if it starts to do so, then another test should be added for that
730         kernelParamsPtr->grid.complexGridSize[i]       = kernelParamsPtr->grid.realGridSize[i];
731         kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
732     }
733     /* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
734     if (!pme_gpu_settings(pmeGpu).performGPUFFT)
735     {
736         // This allows for GPU spreading grid and CPU fftgrid to have the same layout, so that we can copy the data directly
737         kernelParamsPtr->grid.realGridSizePadded[ZZ] =
738                 (kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2;
739     }
740
741     /* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
742     kernelParamsPtr->grid.complexGridSize[ZZ] /= 2;
743     kernelParamsPtr->grid.complexGridSize[ZZ]++;
744     kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ];
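    // Illustrative: a 64-point minor dimension yields 64 / 2 + 1 = 33 complex elements after the
    // real-to-complex transform.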
745
746     pme_gpu_realloc_and_copy_fract_shifts(pmeGpu);
747     pme_gpu_realloc_and_copy_bspline_values(pmeGpu);
748     pme_gpu_realloc_grids(pmeGpu);
749     pme_gpu_reinit_3dfft(pmeGpu);
750 }
751
752 /* Several GPU functions that refer to the CPU PME data live here.
753  * We would like to keep these away from the GPU-framework specific code for clarity,
754  * and also to avoid compilation issues with MPI.
755  */
756
757 /*! \brief \libinternal
758  * Copies everything useful from the PME CPU to the PME GPU structure.
759  * The goal is to minimize interaction with the PME CPU structure in the GPU code.
760  *
761  * \param[in] pme         The PME structure.
762  */
763 static void pme_gpu_copy_common_data_from(const gmx_pme_t* pme)
764 {
765     /* TODO: Consider refactoring the CPU PME code to use the same structure,
766      * so that this function becomes 2 lines */
767     PmeGpu* pmeGpu               = pme->gpu;
768     pmeGpu->common->ngrids       = pme->ngrids;
769     pmeGpu->common->epsilon_r    = pme->epsilon_r;
770     pmeGpu->common->ewaldcoeff_q = pme->ewaldcoeff_q;
771     pmeGpu->common->nk[XX]       = pme->nkx;
772     pmeGpu->common->nk[YY]       = pme->nky;
773     pmeGpu->common->nk[ZZ]       = pme->nkz;
774     pmeGpu->common->pme_order    = pme->pme_order;
775     if (pmeGpu->common->pme_order != c_pmeGpuOrder)
776     {
777         GMX_THROW(gmx::NotImplementedError("pme_order != 4 is not implemented!"));
778     }
779     for (int i = 0; i < DIM; i++)
780     {
781         pmeGpu->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGpu->common->nk[i]);
782     }
783     const int cellCount = c_pmeNeighborUnitcellCount;
784     pmeGpu->common->fsh.resize(0);
785     pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshx, pme->fshx + cellCount * pme->nkx);
786     pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshy, pme->fshy + cellCount * pme->nky);
787     pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshz, pme->fshz + cellCount * pme->nkz);
788     pmeGpu->common->nn.resize(0);
789     pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx);
790     pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky);
791     pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
792     pmeGpu->common->runMode       = pme->runMode;
793     pmeGpu->common->isRankPmeOnly = !pme->bPPnode;
794     pmeGpu->common->boxScaler     = pme->boxScaler;
795 }
796
797 /*! \libinternal \brief
798  * Uses heuristics to select the best performing PME spread and gather kernels
799  *
800  * \param[in,out] pmeGpu         The PME GPU structure.
801  */
802 static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeGpu)
803 {
804     if (pmeGpu->kernelParams->atoms.nAtoms > c_pmeGpuPerformanceAtomLimit && (GMX_GPU == GMX_GPU_CUDA))
805     {
806         pmeGpu->settings.useOrderThreadsPerAtom = true;
807         pmeGpu->settings.recalculateSplines     = true;
808     }
809     else
810     {
811         pmeGpu->settings.useOrderThreadsPerAtom = false;
812         pmeGpu->settings.recalculateSplines     = false;
813     }
814 }
815
816
817 /*! \libinternal \brief
818  * Initializes the PME GPU data at the beginning of the run.
819  * TODO: this should become PmeGpu::PmeGpu()
820  *
821  * \param[in,out] pme            The PME structure.
822  * \param[in,out] deviceInfo     The GPU device information structure.
823  * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
824  */
825 static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
826 {
827     pme->gpu       = new PmeGpu();
828     PmeGpu* pmeGpu = pme->gpu;
829     changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy());
830     pmeGpu->common = std::make_shared<PmeShared>();
831
832     /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */
833     /* A convenience variable. */
834     pmeGpu->settings.useDecomposition = (pme->nnodes != 1);
835     /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
836     pmeGpu->settings.performGPUGather = true;
837     // By default GPU-side reduction is off (explicitly set here for tests, otherwise reset per-step)
838     pmeGpu->settings.useGpuForceReduction = false;
839
840     pme_gpu_set_testing(pmeGpu, false);
841
842     pmeGpu->deviceInfo = deviceInfo;
843     GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
844     pmeGpu->programHandle_ = pmeGpuProgram;
845
846     pmeGpu->initializedClfftLibrary_ = std::make_unique<gmx::ClfftInitializer>();
847
848     pme_gpu_init_internal(pmeGpu);
849     pme_gpu_alloc_energy_virial(pmeGpu);
850
851     pme_gpu_copy_common_data_from(pme);
852
853     GMX_ASSERT(pmeGpu->common->epsilon_r != 0.0F, "PME GPU: bad electrostatic coefficient");
854
855     auto* kernelParamsPtr               = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
856     kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGpu->common->epsilon_r;
857 }
858
859 void pme_gpu_transform_spline_atom_data(const PmeGpu*      pmeGpu,
860                                         const PmeAtomComm* atc,
861                                         PmeSplineDataType  type,
862                                         int                dimIndex,
863                                         PmeLayoutTransform transform)
864 {
865     // The GPU atom spline data is laid out in a different way currently than the CPU one.
866     // This function converts the data from GPU to CPU layout (in the host memory).
867     // It is only intended for testing purposes so far.
868     // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their
869     // performance (e.g. spreading on GPU, gathering on CPU).
870     GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
871     const uintmax_t threadIndex  = 0;
872     const auto      atomCount    = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->atoms.nAtoms;
873     const auto      atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
874     const auto      pmeOrder     = pmeGpu->common->pme_order;
875     GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");
876
877     real*  cpuSplineBuffer;
878     float* h_splineBuffer;
879     switch (type)
880     {
881         case PmeSplineDataType::Values:
882             cpuSplineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
883             h_splineBuffer  = pmeGpu->staging.h_theta;
884             break;
885
886         case PmeSplineDataType::Derivatives:
887             cpuSplineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
888             h_splineBuffer  = pmeGpu->staging.h_dtheta;
889             break;
890
891         default: GMX_THROW(gmx::InternalError("Unknown spline data type"));
892     }
893
894     for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
895     {
896         for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
897         {
898             const auto gpuValueIndex =
899                     getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
900             const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
901             GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder,
902                        "Atom spline data index out of bounds (while transforming GPU data layout "
903                        "for host)");
904             switch (transform)
905             {
906                 case PmeLayoutTransform::GpuToHost:
907                     cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
908                     break;
909
910                 case PmeLayoutTransform::HostToGpu:
911                     h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
912                     break;
913
914                 default: GMX_THROW(gmx::InternalError("Unknown layout transform"));
915             }
916         }
917     }
918 }
919
920 void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx::IVec* paddedGridSize)
921 {
922     GMX_ASSERT(gridSize != nullptr, "");
923     GMX_ASSERT(paddedGridSize != nullptr, "");
924     GMX_ASSERT(pmeGpu != nullptr, "");
925     auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
926     for (int i = 0; i < DIM; i++)
927     {
928         (*gridSize)[i]       = kernelParamsPtr->grid.realGridSize[i];
929         (*paddedGridSize)[i] = kernelParamsPtr->grid.realGridSizePadded[i];
930     }
931 }
932
933 void pme_gpu_reinit(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
934 {
935     GMX_ASSERT(pme != nullptr, "Need valid PME object");
936     if (pme->runMode == PmeRunMode::CPU)
937     {
938         GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object");
939         return;
940     }
941
942     if (!pme->gpu)
943     {
944         /* First-time initialization */
945         pme_gpu_init(pme, deviceInfo, pmeGpuProgram);
946     }
947     else
948     {
949         /* After this call nothing in the GPU code should refer to the gmx_pme_t *pme itself - until the next pme_gpu_reinit */
950         pme_gpu_copy_common_data_from(pme);
951     }
952     /* GPU FFT will only get used for a single rank. */
953     pme->gpu->settings.performGPUFFT =
954             (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme->gpu->settings.useDecomposition;
955     pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU);
956
957     /* Reinit active timers */
958     pme_gpu_reinit_timings(pme->gpu);
959
960     pme_gpu_reinit_grids(pme->gpu);
961     // Note: if timing the reinit launch overhead becomes more relevant
962     // (e.g. with regular PP-PME re-balancing), we should pass wcycle here.
963     pme_gpu_reinit_computation(pme, nullptr);
964     /* Clear the previous box - doesn't hurt, and forces the PME CPU recipbox
965      * update for mixed mode on grid switch. TODO: use shared recipbox field.
966      */
967     std::memset(pme->gpu->common->previousBox, 0, sizeof(pme->gpu->common->previousBox));
968 }
969
970 void pme_gpu_destroy(PmeGpu* pmeGpu)
971 {
972     /* Free lots of data */
973     pme_gpu_free_energy_virial(pmeGpu);
974     pme_gpu_free_bspline_values(pmeGpu);
975     pme_gpu_free_forces(pmeGpu);
976     pme_gpu_free_coefficients(pmeGpu);
977     pme_gpu_free_spline_data(pmeGpu);
978     pme_gpu_free_grid_indices(pmeGpu);
979     pme_gpu_free_fract_shifts(pmeGpu);
980     pme_gpu_free_grids(pmeGpu);
981
982     pme_gpu_destroy_3dfft(pmeGpu);
983
984     /* Free the GPU-framework specific data last */
985     pme_gpu_destroy_specific(pmeGpu);
986
987     delete pmeGpu;
988 }
989
990 void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges)
991 {
992     auto* kernelParamsPtr         = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
993     kernelParamsPtr->atoms.nAtoms = nAtoms;
994     const int alignment           = pme_gpu_get_atom_data_alignment(pmeGpu);
995     pmeGpu->nAtomsPadded          = ((nAtoms + alignment - 1) / alignment) * alignment;
996     const int  nAtomsAlloc        = c_usePadding ? pmeGpu->nAtomsPadded : nAtoms;
997     const bool haveToRealloc =
998             (pmeGpu->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */
999     pmeGpu->nAtomsAlloc = nAtomsAlloc;
1000
1001 #if GMX_DOUBLE
1002     GMX_RELEASE_ASSERT(false, "Only single precision supported");
1003     GMX_UNUSED_VALUE(charges);
1004 #else
1005     pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast<const float*>(charges));
1006     /* This could also be gated on haveToRealloc, but the copy always needs to be performed */
1007 #endif
1008
1009     if (haveToRealloc)
1010     {
1011         pme_gpu_realloc_forces(pmeGpu);
1012         pme_gpu_realloc_spline_data(pmeGpu);
1013         pme_gpu_realloc_grid_indices(pmeGpu);
1014     }
1015     pme_gpu_select_best_performing_pme_spreadgather_kernels(pmeGpu);
1016 }
1017
1018 /*! \internal \brief
1019  * Returns the raw timing event from the corresponding GpuRegionTimer (if timings are enabled).
1020  * In CUDA, the result can be a nullptr stub, per the GpuRegionTimer implementation.
1021  *
1022  * \param[in] pmeGpu         The PME GPU data structure.
1023  * \param[in] PMEStageId     The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h
1024  */
1025 static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PMEStageId)
1026 {
1027     CommandEvent* timingEvent = nullptr;
1028     if (pme_gpu_timings_enabled(pmeGpu))
1029     {
1030         GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
1031                    "Wrong PME GPU timing event index");
1032         timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent();
1033     }
1034     return timingEvent;
1035 }
1036
1037 void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, int grid_index)
1038 {
1039     int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;
1040
1041     pme_gpu_start_timing(pmeGpu, timerId);
1042     pmeGpu->archSpecific->fftSetup[grid_index]->perform3dFft(
1043             dir, pme_gpu_fetch_timing_event(pmeGpu, timerId));
1044     pme_gpu_stop_timing(pmeGpu, timerId);
1045 }
1046
1047 /*! \brief
1048  * Given possibly large \p blockCount, returns a compact 1D or 2D grid for kernel scheduling,
1049  * to minimize number of unused blocks.
1050  * to minimize the number of unused blocks.
1051 std::pair<int, int> inline pmeGpuCreateGrid(const PmeGpu* pmeGpu, int blockCount)
1052 {
1053     // How many rows of maximum X width do we need (hopefully just one)
1054     const int minRowCount = (blockCount + pmeGpu->maxGridWidthX - 1) / pmeGpu->maxGridWidthX;
1055     // Trying to make things even
1056     const int colCount = (blockCount + minRowCount - 1) / minRowCount;
1057     GMX_ASSERT((colCount * minRowCount - blockCount) >= 0, "pmeGpuCreateGrid: totally wrong");
1058     GMX_ASSERT((colCount * minRowCount - blockCount) < minRowCount,
1059                "pmeGpuCreateGrid: excessive blocks");
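    // Illustrative: blockCount = 100000 with maxGridWidthX = 65535 gives minRowCount = 2 and
    // colCount = 50000, i.e. a 50000 x 2 launch grid with no unused blocks.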
1060     return std::pair<int, int>(colCount, minRowCount);
1061 }
1062
1063 /*! \brief
1064  * Returns a pointer to the appropriate spline and spread kernel based on the input bool values
1065  *
1066  * \param[in]  pmeGpu                   The PME GPU structure.
1067  * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
1068  * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
1069  *
1070  * \return Pointer to CUDA kernel
1071  */
1072 static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool writeSplinesToGlobal)
1073 {
1074     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1075     if (writeSplinesToGlobal)
1076     {
1077         if (useOrderThreadsPerAtom)
1078         {
1079             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4;
1080         }
1081         else
1082         {
1083             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplines;
1084         }
1085     }
1086     else
1087     {
1088         if (useOrderThreadsPerAtom)
1089         {
1090             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
1091         }
1092         else
1093         {
1094             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
1095         }
1096     }
1097
1098     return kernelPtr;
1099 }
1100
1101 /*! \brief
1102  * Returns a pointer to the appropriate spline kernel based on the input bool values
1103  *
1104  * \param[in]  pmeGpu                   The PME GPU structure.
1105  * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
1106  * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
1107  *
1108  * \return Pointer to CUDA kernel
1109  */
1110 static auto selectSplineKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool gmx_unused writeSplinesToGlobal)
1111 {
1112     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1113     GMX_ASSERT(
1114             writeSplinesToGlobal,
1115             "Spline data should always be written to global memory when just calculating splines");
1116
1117     if (useOrderThreadsPerAtom)
1118     {
1119         kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4;
1120     }
1121     else
1122     {
1123         kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
1124     }
1125     return kernelPtr;
1126 }
1127
1128 /*! \brief
1129  * Returns a pointer to the appropriate spread kernel based on the input bool values
1130  *
1131  * \param[in]  pmeGpu                   The PME GPU structure.
1132  * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
1133  * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
1134  *
1135  * \return Pointer to CUDA kernel
1136  */
1137 static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool writeSplinesToGlobal)
1138 {
1139     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1140     if (writeSplinesToGlobal)
1141     {
1142         if (useOrderThreadsPerAtom)
1143         {
1144             kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4;
1145         }
1146         else
1147         {
1148             kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
1149         }
1150     }
1151     else
1152     {
1153         /* If we are not saving the spline data, we need to recalculate it
1154            using the spline-and-spread kernel */
1155         if (useOrderThreadsPerAtom)
1156         {
1157             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
1158         }
1159         else
1160         {
1161             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
1162         }
1163     }
1164     return kernelPtr;
1165 }
1166
1167 void pme_gpu_spread(const PmeGpu*         pmeGpu,
1168                     GpuEventSynchronizer* xReadyOnDevice,
1169                     int gmx_unused gridIndex,
1170                     real*          h_grid,
1171                     bool           computeSplines,
1172                     bool           spreadCharges)
1173 {
1174     GMX_ASSERT(computeSplines || spreadCharges,
1175                "PME spline/spread kernel has invalid input (nothing to do)");
1176     const auto* kernelParamsPtr = pmeGpu->kernelParams.get();
1177     GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread");
1178
1179     const size_t blockSize = pmeGpu->programHandle_->impl_->spreadWorkGroupSize;
1180
1181     const int order = pmeGpu->common->pme_order;
1182     GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
1183     const bool writeGlobal            = pmeGpu->settings.copyAllOutputs;
1184     const bool useOrderThreadsPerAtom = pmeGpu->settings.useOrderThreadsPerAtom;
1185     const bool recalculateSplines     = pmeGpu->settings.recalculateSplines;
1186 #if GMX_GPU == GMX_GPU_OPENCL
1187     GMX_ASSERT(!useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
1188     GMX_ASSERT(!recalculateSplines, "Recalculating splines not supported in OpenCL");
1189 #endif
1190     const int atomsPerBlock = useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
1191                                                      : blockSize / c_pmeSpreadGatherThreadsPerAtom;
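    // With PME order 4, order-threads-per-atom mode uses 4 threads per atom, while the default
    // mode uses order*order = 16 threads (cf. the 16-threads-per-atom assertion for OpenCL above).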
1192
1193     // TODO: pick smaller block size in runtime if needed
1194     // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
1195     // If doing so, change atomsPerBlock in the kernels as well.
1196     // TODO: test varying block sizes on modern arch-s as well
1197     // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
1198     // (for spline data mostly)
1199     GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
1200                "inconsistent atom data padding vs. spreading block size");
1201
1202     // Ensure that coordinates are ready on the device before launching spread;
1203     // only needed with CUDA on PP+PME ranks; not on separate PME ranks, in unit tests,
1204     // or in OpenCL, as these cases use a single stream (hence xReadyOnDevice == nullptr).
1205     GMX_ASSERT(xReadyOnDevice != nullptr || (GMX_GPU != GMX_GPU_CUDA)
1206                        || pmeGpu->common->isRankPmeOnly || pme_gpu_settings(pmeGpu).copyAllOutputs,
1207                "Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
1208     if (xReadyOnDevice)
1209     {
1210         xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
1211     }
1212
1213     const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
1214     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
1215
1216     KernelLaunchConfig config;
1217     config.blockSize[0] = order;
1218     config.blockSize[1] = useOrderThreadsPerAtom ? 1 : order;
1219     config.blockSize[2] = atomsPerBlock;
1220     config.gridSize[0]  = dimGrid.first;
1221     config.gridSize[1]  = dimGrid.second;
1222     config.stream       = pmeGpu->archSpecific->pmeStream;
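    // The spread launch thus uses blocks of order x 1 x atomsPerBlock threads when running
    // order threads per atom, and order x order x atomsPerBlock threads otherwise.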
1223
1224     int                                timingId;
1225     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1226     if (computeSplines)
1227     {
1228         if (spreadCharges)
1229         {
1230             timingId  = gtPME_SPLINEANDSPREAD;
1231             kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, useOrderThreadsPerAtom,
1232                                                        writeGlobal || (!recalculateSplines));
1233         }
1234         else
1235         {
1236             timingId  = gtPME_SPLINE;
1237             kernelPtr = selectSplineKernelPtr(pmeGpu, useOrderThreadsPerAtom,
1238                                               writeGlobal || (!recalculateSplines));
1239         }
1240     }
1241     else
1242     {
1243         timingId  = gtPME_SPREAD;
1244         kernelPtr = selectSpreadKernelPtr(pmeGpu, useOrderThreadsPerAtom,
1245                                           writeGlobal || (!recalculateSplines));
1246     }
1247
1248
1249     pme_gpu_start_timing(pmeGpu, timingId);
1250     auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
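    // When device buffers can be embedded into the kernel-parameter structure (c_canEmbedBuffers),
    // passing kernelParamsPtr alone is sufficient; otherwise each device buffer is passed to the
    // kernel as a separate argument.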
1251 #if c_canEmbedBuffers
1252     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
1253 #else
1254     const auto kernelArgs = prepareGpuKernelArguments(
1255             kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_theta,
1256             &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
1257             &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->grid.d_fractShiftsTable,
1258             &kernelParamsPtr->grid.d_gridlineIndicesTable, &kernelParamsPtr->atoms.d_coefficients,
1259             &kernelParamsPtr->atoms.d_coordinates);
1260 #endif
1261
1262     launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs);
1263     pme_gpu_stop_timing(pmeGpu, timingId);
1264
1265     const auto& settings    = pmeGpu->settings;
1266     const bool copyBackGrid = spreadCharges && (!settings.performGPUFFT || settings.copyAllOutputs);
1267     if (copyBackGrid)
1268     {
1269         pme_gpu_copy_output_spread_grid(pmeGpu, h_grid);
1270     }
1271     const bool copyBackAtomData =
1272             computeSplines && (!settings.performGPUGather || settings.copyAllOutputs);
1273     if (copyBackAtomData)
1274     {
1275         pme_gpu_copy_output_spread_atom_data(pmeGpu);
1276     }
1277 }
1278
1279 void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial)
1280 {
1281     const auto& settings               = pmeGpu->settings;
1282     const bool  copyInputAndOutputGrid = !settings.performGPUFFT || settings.copyAllOutputs;
1283
1284     auto* kernelParamsPtr = pmeGpu->kernelParams.get();
1285
1286     float* h_gridFloat = reinterpret_cast<float*>(h_grid);
1287     if (copyInputAndOutputGrid)
1288     {
1289         copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, h_gridFloat, 0,
1290                            pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream,
1291                            pmeGpu->settings.transferKind, nullptr);
1292     }
1293
1294     int majorDim = -1, middleDim = -1, minorDim = -1;
1295     switch (gridOrdering)
1296     {
1297         case GridOrdering::YZX:
1298             majorDim  = YY;
1299             middleDim = ZZ;
1300             minorDim  = XX;
1301             break;
1302
1303         case GridOrdering::XYZ:
1304             majorDim  = XX;
1305             middleDim = YY;
1306             minorDim  = ZZ;
1307             break;
1308
1309         default: GMX_ASSERT(false, "Implement grid ordering here and below for the kernel launch");
1310     }
1311
1312     const int maxBlockSize = pmeGpu->programHandle_->impl_->solveMaxWorkGroupSize;
1313
1314     const int gridLineSize      = pmeGpu->kernelParams->grid.complexGridSize[minorDim];
1315     const int gridLinesPerBlock = std::max(maxBlockSize / gridLineSize, 1);
1316     const int blocksPerGridLine = (gridLineSize + maxBlockSize - 1) / maxBlockSize;
1317     int       cellsPerBlock;
1318     if (blocksPerGridLine == 1)
1319     {
1320         cellsPerBlock = gridLineSize * gridLinesPerBlock;
1321     }
1322     else
1323     {
1324         cellsPerBlock = (gridLineSize + blocksPerGridLine - 1) / blocksPerGridLine;
1325     }
1326     const int warpSize  = pmeGpu->programHandle_->impl_->warpSize;
1327     const int blockSize = (cellsPerBlock + warpSize - 1) / warpSize * warpSize;
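    // Worked example with illustrative numbers (not taken from this file): gridLineSize = 100
    // and maxBlockSize = 256 give gridLinesPerBlock = 2 and blocksPerGridLine = 1, hence
    // cellsPerBlock = 200, which a warpSize of 32 rounds up to a blockSize of 224 threads.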
1328
1329     static_assert(GMX_GPU != GMX_GPU_CUDA || c_solveMaxWarpsPerBlock / 2 >= 4,
1330                   "The CUDA solve energy kernels need at least 4 warps. "
1331                   "Here we launch at least half of the maximum number of warps.");
1332
1333     KernelLaunchConfig config;
1334     config.blockSize[0] = blockSize;
1335     config.gridSize[0]  = blocksPerGridLine;
1336     // rounding up to full warps so that shuffle operations produce defined results
1337     config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1)
1338                          / gridLinesPerBlock;
1339     config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim];
1340     config.stream      = pmeGpu->archSpecific->pmeStream;
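    // Each block therefore processes up to cellsPerBlock grid cells: the launch grid covers the
    // minor dimension with blocksPerGridLine blocks per grid line, the middle dimension in chunks
    // of gridLinesPerBlock lines, and every plane of the major dimension.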
1341
1342     int                                timingId  = gtPME_SOLVE;
1343     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1344     if (gridOrdering == GridOrdering::YZX)
1345     {
1346         kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernel
1347                                            : pmeGpu->programHandle_->impl_->solveYZXKernel;
1348     }
1349     else if (gridOrdering == GridOrdering::XYZ)
1350     {
1351         kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernel
1352                                            : pmeGpu->programHandle_->impl_->solveXYZKernel;
1353     }
1354
1355     pme_gpu_start_timing(pmeGpu, timingId);
1356     auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
1357 #if c_canEmbedBuffers
1358     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
1359 #else
1360     const auto kernelArgs = prepareGpuKernelArguments(
1361             kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli,
1362             &kernelParamsPtr->constants.d_virialAndEnergy, &kernelParamsPtr->grid.d_fourierGrid);
1363 #endif
1364     launchGpuKernel(kernelPtr, config, timingEvent, "PME solve", kernelArgs);
1365     pme_gpu_stop_timing(pmeGpu, timingId);
1366
1367     if (computeEnergyAndVirial)
1368     {
1369         copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy,
1370                              &kernelParamsPtr->constants.d_virialAndEnergy, 0, c_virialAndEnergyCount,
1371                              pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
1372     }
1373
1374     if (copyInputAndOutputGrid)
1375     {
1376         copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid, 0,
1377                              pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream,
1378                              pmeGpu->settings.transferKind, nullptr);
1379     }
1380 }
1381
1382 /*! \brief
1383  * Returns a pointer to the appropriate gather kernel based on the input values
1384  *
1385  * \param[in]  pmeGpu                   The PME GPU structure.
1386  * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
1387  * \param[in]  readSplinesFromGlobal    bool controlling if we should read the spline data from global memory
1388  *
1389  * \return Pointer to CUDA kernel
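 *
 * A usage sketch, mirroring the call made from pme_gpu_gather() below:
 * \code
 *   PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
 *           selectGatherKernelPtr(pmeGpu, useOrderThreadsPerAtom, readGlobal || (!recalculateSplines));
 * \endcode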
1390  */
1391 inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool readSplinesFromGlobal)
1392
1393 {
1394     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1395
1396     if (readSplinesFromGlobal)
1397     {
1398         if (useOrderThreadsPerAtom)
1399         {
1400             kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4;
1401         }
1402         else
1403         {
1404             kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplines;
1405         }
1406     }
1407     else
1408     {
1409         if (useOrderThreadsPerAtom)
1410         {
1411             kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4;
1412         }
1413         else
1414         {
1415             kernelPtr = pmeGpu->programHandle_->impl_->gatherKernel;
1416         }
1417     }
1418     return kernelPtr;
1419 }
1420
1421
1422 void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid)
1423 {
1424     const auto& settings = pmeGpu->settings;
1425     if (!settings.performGPUFFT || settings.copyAllOutputs)
1426     {
1427         pme_gpu_copy_input_gather_grid(pmeGpu, const_cast<float*>(h_grid));
1428     }
1429
1430     if (settings.copyAllOutputs)
1431     {
1432         pme_gpu_copy_input_gather_atom_data(pmeGpu);
1433     }
1434
1435     /* copyAllOutputs is set in unit tests; when it is, the gather kernel reads the spline data from global memory */
1436     const bool   readGlobal             = pmeGpu->settings.copyAllOutputs;
1437     const size_t blockSize              = pmeGpu->programHandle_->impl_->gatherWorkGroupSize;
1438     const bool   useOrderThreadsPerAtom = pmeGpu->settings.useOrderThreadsPerAtom;
1439     const bool   recalculateSplines     = pmeGpu->settings.recalculateSplines;
1440 #if GMX_GPU == GMX_GPU_OPENCL
1441     GMX_ASSERT(!useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
1442     GMX_ASSERT(!recalculateSplines, "Recalculating splines not supported in OpenCL");
1443 #endif
1444     const int atomsPerBlock = useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
1445                                                      : blockSize / c_pmeSpreadGatherThreadsPerAtom;
1446
1447     GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
1448                "inconsistent atom data padding vs. gathering block size");
1449
1450     const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
1451     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
1452
1453     const int order = pmeGpu->common->pme_order;
1454     GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
1455
1456     KernelLaunchConfig config;
1457     config.blockSize[0] = order;
1458     config.blockSize[1] = useOrderThreadsPerAtom ? 1 : order;
1459     config.blockSize[2] = atomsPerBlock;
1460     config.gridSize[0]  = dimGrid.first;
1461     config.gridSize[1]  = dimGrid.second;
1462     config.stream       = pmeGpu->archSpecific->pmeStream;
1463
1464     // TODO test different cache configs
1465
1466     int                                timingId = gtPME_GATHER;
1467     PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
1468             selectGatherKernelPtr(pmeGpu, useOrderThreadsPerAtom, readGlobal || (!recalculateSplines));
1469     // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
1470
1471     pme_gpu_start_timing(pmeGpu, timingId);
1472     auto*       timingEvent     = pme_gpu_fetch_timing_event(pmeGpu, timingId);
1473     const auto* kernelParamsPtr = pmeGpu->kernelParams.get();
1474 #if c_canEmbedBuffers
1475     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
1476 #else
1477     const auto kernelArgs = prepareGpuKernelArguments(
1478             kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients,
1479             &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->atoms.d_theta,
1480             &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
1481             &kernelParamsPtr->atoms.d_forces);
1482 #endif
1483     launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
1484     pme_gpu_stop_timing(pmeGpu, timingId);
1485
1486     if (pmeGpu->settings.useGpuForceReduction)
1487     {
1488         pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream);
1489     }
1490     else
1491     {
1492         pme_gpu_copy_output_forces(pmeGpu);
1493     }
1494 }
1495
1496 void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
1497 {
1498     if (pmeGpu && pmeGpu->kernelParams)
1499     {
1500         return pmeGpu->kernelParams->atoms.d_forces;
1501     }
1502     else
1503     {
1504         return nullptr;
1505     }
1506 }
1507
1508 void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx::RVec> d_x)
1509 {
1510     GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
1511                "PME GPU device buffer cannot be set in non-GPU builds or before the GPU PME was "
1512                "initialized.");
1513
1514     GMX_ASSERT(checkDeviceBuffer(d_x, pmeGpu->kernelParams->atoms.nAtoms),
1515                "The device-side buffer cannot be set.");
1516
1517     pmeGpu->kernelParams->atoms.d_coordinates = d_x;
1518 }
1519
1520 void* pme_gpu_get_stream(const PmeGpu* pmeGpu)
1521 {
1522     if (pmeGpu)
1523     {
1524         return static_cast<void*>(&pmeGpu->archSpecific->pmeStream);
1525     }
1526     else
1527     {
1528         return nullptr;
1529     }
1530 }
1531
1532 void* pme_gpu_get_context(const PmeGpu* pmeGpu)
1533 {
1534     if (pmeGpu)
1535     {
1536         return static_cast<void*>(&pmeGpu->archSpecific->context);
1537     }
1538     else
1539     {
1540         return nullptr;
1541     }
1542 }
1543
1544 GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
1545 {
1546     if (pmeGpu && pmeGpu->kernelParams)
1547     {
1548         return &pmeGpu->archSpecific->pmeForcesReady;
1549     }
1550     else
1551     {
1552         return nullptr;
1553     }
1554 }