src/gromacs/ewald/pme_gpu_internal.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36 /*! \internal \file
37  *
38  * \brief This file contains internal function implementations
39  * for performing the PME calculations on GPU.
40  *
41  * Note that this file is compiled as regular C++ source in OpenCL builds, but
42  * it is treated as CUDA source in CUDA-enabled GPU builds.
43  *
44  * \author Aleksei Iupinov <a.yupinov@gmail.com>
45  * \ingroup module_ewald
46  */
47
48 #include "gmxpre.h"
49
50 #include "pme_gpu_internal.h"
51
52 #include "config.h"
53
54 #include <list>
55 #include <memory>
56 #include <string>
57
58 #include "gromacs/ewald/ewald_utils.h"
59 #include "gromacs/gpu_utils/gpu_utils.h"
60 #include "gromacs/math/invertmatrix.h"
61 #include "gromacs/math/units.h"
62 #include "gromacs/timing/gpu_timing.h"
63 #include "gromacs/utility/exceptions.h"
64 #include "gromacs/utility/fatalerror.h"
65 #include "gromacs/utility/gmxassert.h"
66 #include "gromacs/utility/logger.h"
67 #include "gromacs/utility/stringutil.h"
68
69 #if GMX_GPU == GMX_GPU_CUDA
70 #    include "gromacs/gpu_utils/pmalloc_cuda.h"
71
72 #    include "pme.cuh"
73 #elif GMX_GPU == GMX_GPU_OPENCL
74 #    include "gromacs/gpu_utils/gmxopencl.h"
75 #endif
76
77 #include "gromacs/ewald/pme.h"
78
79 #include "pme_gpu_3dfft.h"
80 #include "pme_gpu_constants.h"
81 #include "pme_gpu_program_impl.h"
82 #include "pme_gpu_timings.h"
83 #include "pme_gpu_types.h"
84 #include "pme_gpu_types_host.h"
85 #include "pme_gpu_types_host_impl.h"
86 #include "pme_gpu_utils.h"
87 #include "pme_grid.h"
88 #include "pme_internal.h"
89 #include "pme_solve.h"
90
91 /*! \brief
92  * CUDA only.
93  * Atom limit above which it is advantageous to turn on recalculating
94  * the splines in the gather and to use fewer threads per atom in the spline and spread kernels.
95  */
96 constexpr int c_pmeGpuPerformanceAtomLimit = 23000;
97
98 /*! \internal \brief
99  * Wrapper for getting a pointer to the plain C++ part of the GPU kernel parameters structure.
100  *
101  * \param[in] pmeGpu  The PME GPU structure.
102  * \returns The pointer to the kernel parameters.
103  */
104 static PmeGpuKernelParamsBase* pme_gpu_get_kernel_params_base_ptr(const PmeGpu* pmeGpu)
105 {
106     // reinterpret_cast is needed because the derived CUDA structure is not known in this file
107     auto* kernelParamsPtr = reinterpret_cast<PmeGpuKernelParamsBase*>(pmeGpu->kernelParams.get());
108     return kernelParamsPtr;
109 }
110
111 int pme_gpu_get_atom_data_alignment(const PmeGpu* /*unused*/)
112 {
113     // TODO: this can be simplified, as c_pmeAtomDataAlignment is now constant
114     if (c_usePadding)
115     {
116         return c_pmeAtomDataAlignment;
117     }
118     else
119     {
120         return 0;
121     }
122 }
123
124 int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu)
125 {
126     if (pmeGpu->settings.useOrderThreadsPerAtom)
127     {
128         return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom;
129     }
130     else
131     {
132         return pmeGpu->programHandle_->impl_->warpSize / c_pmeSpreadGatherThreadsPerAtom;
133     }
134 }
135
136 void pme_gpu_synchronize(const PmeGpu* pmeGpu)
137 {
138     gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream);
139 }
140
141 void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu)
142 {
143     const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
144     allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount,
145                          pmeGpu->archSpecific->context);
146     pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize);
147 }
148
149 void pme_gpu_free_energy_virial(PmeGpu* pmeGpu)
150 {
151     freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy);
152     pfree(pmeGpu->staging.h_virialAndEnergy);
153     pmeGpu->staging.h_virialAndEnergy = nullptr;
154 }
155
156 void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu)
157 {
158     clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0,
159                            c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream);
160 }
161
162 void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu)
163 {
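    /* Note: the B-spline moduli for all three dimensions are packed back to back into a single
     * device buffer; splineValuesOffset[] records where each dimension's block starts
     * (0, nx, nx + ny), matching the per-dimension copies from pmeGpu->common->bsp_mod[] below. */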
164     const int splineValuesOffset[DIM] = { 0, pmeGpu->kernelParams->grid.realGridSize[XX],
165                                           pmeGpu->kernelParams->grid.realGridSize[XX]
166                                                   + pmeGpu->kernelParams->grid.realGridSize[YY] };
167     memcpy(&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));
168
169     const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX]
170                                     + pmeGpu->kernelParams->grid.realGridSize[YY]
171                                     + pmeGpu->kernelParams->grid.realGridSize[ZZ];
172     const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
173     reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
174                            &pmeGpu->archSpecific->splineValuesSize,
175                            &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->context);
176     if (shouldRealloc)
177     {
178         /* Reallocate the host buffer */
179         pfree(pmeGpu->staging.h_splineModuli);
180         pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_splineModuli),
181                 newSplineValuesSize * sizeof(float));
182     }
183     for (int i = 0; i < DIM; i++)
184     {
185         memcpy(pmeGpu->staging.h_splineModuli + splineValuesOffset[i],
186                pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
187     }
188     /* TODO: pin original buffer instead! */
189     copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli,
190                        0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream,
191                        pmeGpu->settings.transferKind, nullptr);
192 }
193
194 void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu)
195 {
196     pfree(pmeGpu->staging.h_splineModuli);
197     freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli);
198 }
199
200 void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
201 {
202     const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
203     GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
204     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
205                            &pmeGpu->archSpecific->forcesSize,
206                            &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->context);
207     pmeGpu->staging.h_forces.reserveWithPadding(pmeGpu->nAtomsAlloc);
208     pmeGpu->staging.h_forces.resizeWithPadding(pmeGpu->kernelParams->atoms.nAtoms);
209 }
210
211 void pme_gpu_free_forces(const PmeGpu* pmeGpu)
212 {
213     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces);
214 }
215
216 void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
217 {
218     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
219     float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
220     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat, 0,
221                        DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream,
222                        pmeGpu->settings.transferKind, nullptr);
223 }
224
225 void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
226 {
227     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
228     float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
229     copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0,
230                          DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream,
231                          pmeGpu->settings.transferKind, nullptr);
232 }
233
234 void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients)
235 {
236     GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
237     const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
238     GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
239     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
240                            &pmeGpu->archSpecific->coefficientsSize,
241                            &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->context);
242     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
243                        const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
244                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
245     if (c_usePadding)
246     {
247         const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
248         const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
249         if (paddingCount > 0)
250         {
251             clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
252                                    paddingCount, pmeGpu->archSpecific->pmeStream);
253         }
254     }
255 }
256
257 void pme_gpu_free_coefficients(const PmeGpu* pmeGpu)
258 {
259     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients);
260 }
261
262 void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
263 {
264     const int    order        = pmeGpu->common->pme_order;
265     const int    alignment    = pme_gpu_get_atoms_per_warp(pmeGpu);
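    // Round the allocation up to a whole number of warps' worth of atoms, so that spline data is
    // stored in complete per-warp blocks (illustrative: 23 atoms with 4 atoms per warp pad to 24).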
266     const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
267     const int    newSplineDataSize = DIM * order * nAtomsPadded;
268     GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
269     /* Two arrays of the same size */
270     const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
271     int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
272     int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
273     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
274                            &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->context);
275     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
276                            &pmeGpu->archSpecific->splineDataSize,
277                            &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->context);
278     // the host side reallocation
279     if (shouldRealloc)
280     {
281         pfree(pmeGpu->staging.h_theta);
282         pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_theta), newSplineDataSize * sizeof(float));
283         pfree(pmeGpu->staging.h_dtheta);
284         pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_dtheta), newSplineDataSize * sizeof(float));
285     }
286 }
287
288 void pme_gpu_free_spline_data(const PmeGpu* pmeGpu)
289 {
290     /* Two arrays of the same size */
291     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta);
292     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta);
293     pfree(pmeGpu->staging.h_theta);
294     pfree(pmeGpu->staging.h_dtheta);
295 }
296
297 void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu)
298 {
299     const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
300     GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
301     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
302                            &pmeGpu->archSpecific->gridlineIndicesSize,
303                            &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context);
304     pfree(pmeGpu->staging.h_gridlineIndices);
305     pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_gridlineIndices), newIndicesSize * sizeof(int));
306 }
307
308 void pme_gpu_free_grid_indices(const PmeGpu* pmeGpu)
309 {
310     freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices);
311     pfree(pmeGpu->staging.h_gridlineIndices);
312 }
313
314 void pme_gpu_realloc_grids(PmeGpu* pmeGpu)
315 {
316     auto*     kernelParamsPtr = pmeGpu->kernelParams.get();
317     const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX]
318                                 * kernelParamsPtr->grid.realGridSizePadded[YY]
319                                 * kernelParamsPtr->grid.realGridSizePadded[ZZ];
320     const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX]
321                                    * kernelParamsPtr->grid.complexGridSizePadded[YY]
322                                    * kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
323     // Multiplied by 2 because the complex grid size is counted in complex numbers, while all allocations/pointers are in floats
324     if (pmeGpu->archSpecific->performOutOfPlaceFFT)
325     {
326         /* 2 separate grids */
327         reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
328                                &pmeGpu->archSpecific->complexGridSize,
329                                &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->context);
330         reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
331                                &pmeGpu->archSpecific->realGridSize,
332                                &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
333     }
334     else
335     {
336         /* A single buffer so that any grid will fit */
337         const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
338         reallocateDeviceBuffer(
339                 &kernelParamsPtr->grid.d_realGrid, newGridsSize, &pmeGpu->archSpecific->realGridSize,
340                 &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
341         kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
342         pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
343         // the size might get used later for copying the grid
344     }
345 }
346
347 void pme_gpu_free_grids(const PmeGpu* pmeGpu)
348 {
349     if (pmeGpu->archSpecific->performOutOfPlaceFFT)
350     {
351         freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid);
352     }
353     freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid);
354 }
355
356 void pme_gpu_clear_grids(const PmeGpu* pmeGpu)
357 {
358     clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0,
359                            pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream);
360 }
361
362 void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
363 {
364     pme_gpu_free_fract_shifts(pmeGpu);
365
366     auto* kernelParamsPtr = pmeGpu->kernelParams.get();
367
368     const int nx                  = kernelParamsPtr->grid.realGridSize[XX];
369     const int ny                  = kernelParamsPtr->grid.realGridSize[YY];
370     const int nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
371     const int cellCount           = c_pmeNeighborUnitcellCount;
372     const int gridDataOffset[DIM] = { 0, cellCount * nx, cellCount * (nx + ny) };
373
374     memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));
375
376     const int newFractShiftsSize = cellCount * (nx + ny + nz);
377
378 #if GMX_GPU == GMX_GPU_CUDA
379     initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable, kernelParamsPtr->fractShiftsTableTexture,
380                          pmeGpu->common->fsh.data(), newFractShiftsSize);
381
382     initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
383                          kernelParamsPtr->gridlineIndicesTableTexture, pmeGpu->common->nn.data(),
384                          newFractShiftsSize);
385 #elif GMX_GPU == GMX_GPU_OPENCL
386     // No dedicated texture routines in OpenCL, so plain device buffers and explicit copies are used
387     allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize,
388                          pmeGpu->archSpecific->context);
389     allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize,
390                          pmeGpu->archSpecific->context);
391     copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0,
392                        newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
393                        GpuApiCallBehavior::Async, nullptr);
394     copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(), 0,
395                        newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
396                        GpuApiCallBehavior::Async, nullptr);
397 #endif
398 }
399
400 void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu)
401 {
402     auto* kernelParamsPtr = pmeGpu->kernelParams.get();
403 #if GMX_GPU == GMX_GPU_CUDA
404     destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
405                             kernelParamsPtr->fractShiftsTableTexture);
406     destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
407                             kernelParamsPtr->gridlineIndicesTableTexture);
408 #elif GMX_GPU == GMX_GPU_OPENCL
409     freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
410     freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
411 #endif
412 }
413
414 bool pme_gpu_stream_query(const PmeGpu* pmeGpu)
415 {
416     return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream);
417 }
418
419 void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid)
420 {
421     copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid, 0, pmeGpu->archSpecific->realGridSize,
422                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
423 }
424
425 void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid)
426 {
427     copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid, 0,
428                          pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream,
429                          pmeGpu->settings.transferKind, nullptr);
430     pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream);
431 }
432
433 void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
434 {
435     const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
436     const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
437     const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
438     auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
439     copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
440                          pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
441     copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta, 0, splinesCount,
442                          pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
443     copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices,
444                          0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream,
445                          pmeGpu->settings.transferKind, nullptr);
446 }
447
448 void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
449 {
450     const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
451     const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
452     const size_t splinesCount    = DIM * nAtomsPadded * pmeGpu->common->pme_order;
453     auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
454     if (c_usePadding)
455     {
456         // TODO: could clear only the padding and not the whole thing, but this is test-exclusive code anyway
457         clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0,
458                                pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream);
459         clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
460                                pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
461                                pmeGpu->archSpecific->pmeStream);
462         clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
463                                pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
464                                pmeGpu->archSpecific->pmeStream);
465     }
466     copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
467                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
468     copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
469                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
470     copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
471                        0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream,
472                        pmeGpu->settings.transferKind, nullptr);
473 }
474
475 void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu)
476 {
477     pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
478 }
479
480 void pme_gpu_init_internal(PmeGpu* pmeGpu)
481 {
482 #if GMX_GPU == GMX_GPU_CUDA
483     // Prepare to use the device that this PME task was assigned earlier.
484     // Other entities, such as CUDA timing events, are known to implicitly use the device context.
485     CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
486 #endif
487
488     /* Allocate the target-specific structures */
489     pmeGpu->archSpecific.reset(new PmeGpuSpecific());
490     pmeGpu->kernelParams.reset(new PmeGpuKernelParams());
491
492     pmeGpu->archSpecific->performOutOfPlaceFFT = true;
493     /* This should give better performance, according to the cuFFT documentation.
494      * The performance seems to be the same though.
495      * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
496      */
497
498     // TODO: this is just a convenient reuse, because programHandle_ is currently in charge of creating the context
499     pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context;
500
501     // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
502     if (GMX_GPU == GMX_GPU_CUDA)
503     {
504         /* WARNING: CUDA timings are incorrect with multiple streams.
505          *          This is the main reason why they are disabled by default.
506          */
507         // TODO: Consider turning on by default when we can detect nr of streams.
508         pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
509     }
510     else if (GMX_GPU == GMX_GPU_OPENCL)
511     {
512         pmeGpu->archSpecific->useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
513     }
514
515 #if GMX_GPU == GMX_GPU_CUDA
516     pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
517 #elif GMX_GPU == GMX_GPU_OPENCL
518     pmeGpu->maxGridWidthX = INT32_MAX / 2;
519     // TODO: is there really no global work size limit in OpenCL?
520 #endif
521
522     /* Creating a PME GPU stream:
523      * - default high priority with CUDA
524      * - no priorities implemented yet with OpenCL; see #2532
525      */
526 #if GMX_GPU == GMX_GPU_CUDA
527     cudaError_t stat;
528     int         highest_priority, lowest_priority;
529     stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
530     CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
531     stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream,
532                                         cudaStreamDefault, // cudaStreamNonBlocking,
533                                         highest_priority);
534     CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
535 #elif GMX_GPU == GMX_GPU_OPENCL
536     cl_command_queue_properties queueProperties =
537             pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
538     cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId;
539     cl_int       clError;
540     pmeGpu->archSpecific->pmeStream =
541             clCreateCommandQueue(pmeGpu->archSpecific->context, device_id, queueProperties, &clError);
542     if (clError != CL_SUCCESS)
543     {
544         GMX_THROW(gmx::InternalError("Failed to create PME command queue"));
545     }
546 #endif
547 }
548
549 void pme_gpu_destroy_specific(const PmeGpu* pmeGpu)
550 {
551 #if GMX_GPU == GMX_GPU_CUDA
552     /* Destroy the CUDA stream */
553     cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream);
554     CU_RET_ERR(stat, "PME cudaStreamDestroy error");
555 #elif GMX_GPU == GMX_GPU_OPENCL
556     cl_int clError = clReleaseCommandQueue(pmeGpu->archSpecific->pmeStream);
557     if (clError != CL_SUCCESS)
558     {
559         gmx_warning("Failed to destroy PME command queue");
560     }
561 #endif
562 }
563
564 void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
565 {
566     if (pme_gpu_settings(pmeGpu).performGPUFFT)
567     {
568         pmeGpu->archSpecific->fftSetup.resize(0);
569         for (int i = 0; i < pmeGpu->common->ngrids; i++)
570         {
571             pmeGpu->archSpecific->fftSetup.push_back(std::make_unique<GpuParallel3dFft>(pmeGpu));
572         }
573     }
574 }
575
576 void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu)
577 {
578     pmeGpu->archSpecific->fftSetup.resize(0);
579 }
580
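// Note: this maps (order, splineIndex, dimIndex, atomIndex) onto the flat index of the GPU spline
// data layout by dispatching to the templated index helpers for the supported atoms-per-warp
// values; it is only used by the test-oriented layout transform further below.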
581 int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
582 {
583     if (order != c_pmeGpuOrder)
584     {
585         throw order;
586     }
587     constexpr int fixedOrder = c_pmeGpuOrder;
588     GMX_UNUSED_VALUE(fixedOrder);
589
590     const int atomWarpIndex = atomIndex % atomsPerWarp;
591     const int warpIndex     = atomIndex / atomsPerWarp;
592     int       indexBase, result;
593     switch (atomsPerWarp)
594     {
595         case 1:
596             indexBase = getSplineParamIndexBase<fixedOrder, 1>(warpIndex, atomWarpIndex);
597             result    = getSplineParamIndex<fixedOrder, 1>(indexBase, dimIndex, splineIndex);
598             break;
599
600         case 2:
601             indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
602             result    = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
603             break;
604
605         case 4:
606             indexBase = getSplineParamIndexBase<fixedOrder, 4>(warpIndex, atomWarpIndex);
607             result    = getSplineParamIndex<fixedOrder, 4>(indexBase, dimIndex, splineIndex);
608             break;
609
610         case 8:
611             indexBase = getSplineParamIndexBase<fixedOrder, 8>(warpIndex, atomWarpIndex);
612             result    = getSplineParamIndex<fixedOrder, 8>(indexBase, dimIndex, splineIndex);
613             break;
614
615         default:
616             GMX_THROW(gmx::NotImplementedError(
617                     gmx::formatString("Test function call not unrolled for atomsPerWarp = %d in "
618                                       "getSplineParamFullIndex",
619                                       atomsPerWarp)));
620     }
621     return result;
622 }
623
624 void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, PmeOutput* output)
625 {
626     const PmeGpu* pmeGpu = pme.gpu;
627     for (int j = 0; j < c_virialAndEnergyCount; j++)
628     {
629         GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[j]),
630                    "PME GPU produces incorrect energy/virial.");
631     }
632
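    // The constant factors below (0.25 for the virial components, 0.5 for the energy) convert the
    // raw accumulators produced by the solve kernel into the final virial and energy values.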
633     unsigned int j                 = 0;
634     output->coulombVirial_[XX][XX] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
635     output->coulombVirial_[YY][YY] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
636     output->coulombVirial_[ZZ][ZZ] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
637     output->coulombVirial_[XX][YY] = output->coulombVirial_[YY][XX] =
638             0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
639     output->coulombVirial_[XX][ZZ] = output->coulombVirial_[ZZ][XX] =
640             0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
641     output->coulombVirial_[YY][ZZ] = output->coulombVirial_[ZZ][YY] =
642             0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
643     output->coulombEnergy_ = 0.5F * pmeGpu->staging.h_virialAndEnergy[j++];
644 }
645
646 /*! \brief Sets the force-related members in \p output
647  *
648  * \param[in]   pmeGpu      PME GPU data structure
649  * \param[out]  output      Pointer to PME output data structure
650  */
651 static void pme_gpu_getForceOutput(PmeGpu* pmeGpu, PmeOutput* output)
652 {
653     output->haveForceOutput_ = !pmeGpu->settings.useGpuForceReduction;
654     if (output->haveForceOutput_)
655     {
656         output->forces_ = pmeGpu->staging.h_forces;
657     }
658 }
659
660 PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const int flags)
661 {
662     PmeGpu*    pmeGpu                      = pme.gpu;
663     const bool haveComputedEnergyAndVirial = (flags & GMX_PME_CALC_ENER_VIR) != 0;
664
665     PmeOutput output;
666
667     pme_gpu_getForceOutput(pmeGpu, &output);
668
669     // The caller knows from the flags that the energy and the virial are not usable
670     // on the else branch
671     if (haveComputedEnergyAndVirial)
672     {
673         if (pme_gpu_settings(pmeGpu).performGPUSolve)
674         {
675             pme_gpu_getEnergyAndVirial(pme, &output);
676         }
677         else
678         {
679             get_pme_ener_vir_q(pme.solve_work, pme.nthread, &output);
680         }
681     }
682     return output;
683 }
684
685 void pme_gpu_update_input_box(PmeGpu gmx_unused* pmeGpu, const matrix gmx_unused box)
686 {
687 #if GMX_DOUBLE
688     GMX_THROW(gmx::NotImplementedError("PME is implemented for single-precision only on GPU"));
689 #else
690     matrix scaledBox;
691     pmeGpu->common->boxScaler->scaleBox(box, scaledBox);
692     auto* kernelParamsPtr              = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
693     kernelParamsPtr->current.boxVolume = scaledBox[XX][XX] * scaledBox[YY][YY] * scaledBox[ZZ][ZZ];
694     GMX_ASSERT(kernelParamsPtr->current.boxVolume != 0.0F, "Zero volume of the unit cell");
695     matrix recipBox;
696     gmx::invertBoxMatrix(scaledBox, recipBox);
697
698     /* The GPU recipBox is transposed as compared to the CPU recipBox.
699      * Spread uses matrix columns (while solve and gather use rows).
700      * There is no particular reason for this; it might be further rethought/optimized for better access patterns.
701      */
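    // GROMACS box matrices are lower-triangular, so recipBox is too; the hard-coded zeros below
    // are the reciprocal-box elements that are always zero, written here in the transposed layout.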
702     const real newRecipBox[DIM][DIM] = { { recipBox[XX][XX], recipBox[YY][XX], recipBox[ZZ][XX] },
703                                          { 0.0, recipBox[YY][YY], recipBox[ZZ][YY] },
704                                          { 0.0, 0.0, recipBox[ZZ][ZZ] } };
705     memcpy(kernelParamsPtr->current.recipBox, newRecipBox, sizeof(matrix));
706 #endif
707 }
708
709 /*! \brief \libinternal
710  * (Re-)initializes all the PME GPU data related to the grid size and cut-off.
711  *
712  * \param[in] pmeGpu            The PME GPU structure.
713  */
714 static void pme_gpu_reinit_grids(PmeGpu* pmeGpu)
715 {
716     auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
717     kernelParamsPtr->grid.ewaldFactor =
718             (M_PI * M_PI) / (pmeGpu->common->ewaldcoeff_q * pmeGpu->common->ewaldcoeff_q);
719
720     /* The grid size variants */
721     for (int i = 0; i < DIM; i++)
722     {
723         kernelParamsPtr->grid.realGridSize[i] = pmeGpu->common->nk[i];
724         kernelParamsPtr->grid.realGridSizeFP[i] =
725                 static_cast<float>(kernelParamsPtr->grid.realGridSize[i]);
726         kernelParamsPtr->grid.realGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
727
728         // The complex grid currently uses no padding;
729         // if it starts to do so, then another test should be added for that
730         kernelParamsPtr->grid.complexGridSize[i]       = kernelParamsPtr->grid.realGridSize[i];
731         kernelParamsPtr->grid.complexGridSizePadded[i] = kernelParamsPtr->grid.realGridSize[i];
732     }
733     /* FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
734     if (!pme_gpu_settings(pmeGpu).performGPUFFT)
735     {
736         // This allows for GPU spreading grid and CPU fftgrid to have the same layout, so that we can copy the data directly
737         kernelParamsPtr->grid.realGridSizePadded[ZZ] =
738                 (kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2;
739     }
740
741     /* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
742     kernelParamsPtr->grid.complexGridSize[ZZ] /= 2;
743     kernelParamsPtr->grid.complexGridSize[ZZ]++;
744     kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ];
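    // Illustrative: a 64-point minor dimension yields 64 / 2 + 1 = 33 complex elements after the
    // real-to-complex transform.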
745
746     pme_gpu_realloc_and_copy_fract_shifts(pmeGpu);
747     pme_gpu_realloc_and_copy_bspline_values(pmeGpu);
748     pme_gpu_realloc_grids(pmeGpu);
749     pme_gpu_reinit_3dfft(pmeGpu);
750 }
751
752 /* Several GPU functions that refer to the CPU PME data live here.
753  * We would like to keep these away from the GPU-framework specific code for clarity,
754  * and also to avoid compilation issues with MPI.
755  */
756
757 /*! \brief \libinternal
758  * Copies everything useful from the PME CPU to the PME GPU structure.
759  * The goal is to minimize interaction with the PME CPU structure in the GPU code.
760  *
761  * \param[in] pme         The PME structure.
762  */
763 static void pme_gpu_copy_common_data_from(const gmx_pme_t* pme)
764 {
765     /* TODO: Consider refactoring the CPU PME code to use the same structure,
766      * so that this function becomes 2 lines */
767     PmeGpu* pmeGpu               = pme->gpu;
768     pmeGpu->common->ngrids       = pme->ngrids;
769     pmeGpu->common->epsilon_r    = pme->epsilon_r;
770     pmeGpu->common->ewaldcoeff_q = pme->ewaldcoeff_q;
771     pmeGpu->common->nk[XX]       = pme->nkx;
772     pmeGpu->common->nk[YY]       = pme->nky;
773     pmeGpu->common->nk[ZZ]       = pme->nkz;
774     pmeGpu->common->pme_order    = pme->pme_order;
775     if (pmeGpu->common->pme_order != c_pmeGpuOrder)
776     {
777         GMX_THROW(gmx::NotImplementedError("pme_order != 4 is not implemented!"));
778     }
779     for (int i = 0; i < DIM; i++)
780     {
781         pmeGpu->common->bsp_mod[i].assign(pme->bsp_mod[i], pme->bsp_mod[i] + pmeGpu->common->nk[i]);
782     }
783     const int cellCount = c_pmeNeighborUnitcellCount;
784     pmeGpu->common->fsh.resize(0);
785     pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshx, pme->fshx + cellCount * pme->nkx);
786     pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshy, pme->fshy + cellCount * pme->nky);
787     pmeGpu->common->fsh.insert(pmeGpu->common->fsh.end(), pme->fshz, pme->fshz + cellCount * pme->nkz);
788     pmeGpu->common->nn.resize(0);
789     pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnx, pme->nnx + cellCount * pme->nkx);
790     pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nny, pme->nny + cellCount * pme->nky);
791     pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
792     pmeGpu->common->runMode       = pme->runMode;
793     pmeGpu->common->isRankPmeOnly = !pme->bPPnode;
794     pmeGpu->common->boxScaler     = pme->boxScaler;
795 }
796
797 /*! \libinternal \brief
798  * Uses heuristics to select the best performing PME spread and gather kernels
799  *
800  * \param[in,out] pmeGpu         The PME GPU structure.
801  */
802 static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeGpu)
803 {
804     if (pmeGpu->kernelParams->atoms.nAtoms > c_pmeGpuPerformanceAtomLimit && (GMX_GPU == GMX_GPU_CUDA))
805     {
806         pmeGpu->settings.useOrderThreadsPerAtom = true;
807         pmeGpu->settings.recalculateSplines     = true;
808     }
809     else
810     {
811         pmeGpu->settings.useOrderThreadsPerAtom = false;
812         pmeGpu->settings.recalculateSplines     = false;
813     }
814 }
815
816
817 /*! \libinternal \brief
818  * Initializes the PME GPU data at the beginning of the run.
819  * TODO: this should become PmeGpu::PmeGpu()
820  *
821  * \param[in,out] pme            The PME structure.
822  * \param[in,out] deviceInfo     The GPU device information structure.
823  * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
824  */
825 static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
826 {
827     pme->gpu       = new PmeGpu();
828     PmeGpu* pmeGpu = pme->gpu;
829     changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy());
830     pmeGpu->common = std::make_shared<PmeShared>();
831
832     /* These settings are set here for the whole run; dynamic ones are set in pme_gpu_reinit() */
833     /* A convenience variable. */
834     pmeGpu->settings.useDecomposition = (pme->nnodes != 1);
835     /* TODO: CPU gather with GPU spread is broken due to different theta/dtheta layout. */
836     pmeGpu->settings.performGPUGather = true;
837     // By default GPU-side reduction is off (explicitly set here for tests, otherwise reset per-step)
838     pmeGpu->settings.useGpuForceReduction = false;
839
840     pme_gpu_set_testing(pmeGpu, false);
841
842     pmeGpu->deviceInfo = deviceInfo;
843     GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
844     pmeGpu->programHandle_ = pmeGpuProgram;
845
846     pmeGpu->initializedClfftLibrary_ = std::make_unique<gmx::ClfftInitializer>();
847
848     pme_gpu_init_internal(pmeGpu);
849     pme_gpu_alloc_energy_virial(pmeGpu);
850
851     pme_gpu_copy_common_data_from(pme);
852
853     GMX_ASSERT(pmeGpu->common->epsilon_r != 0.0F, "PME GPU: bad electrostatic coefficient");
854
855     auto* kernelParamsPtr               = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
856     kernelParamsPtr->constants.elFactor = ONE_4PI_EPS0 / pmeGpu->common->epsilon_r;
857 }
858
859 void pme_gpu_transform_spline_atom_data(const PmeGpu*      pmeGpu,
860                                         const PmeAtomComm* atc,
861                                         PmeSplineDataType  type,
862                                         int                dimIndex,
863                                         PmeLayoutTransform transform)
864 {
865     // The GPU atom spline data is laid out in a different way currently than the CPU one.
866     // This function converts the data from GPU to CPU layout (in the host memory).
867     // It is only intended for testing purposes so far.
868     // Ideally we should use similar layouts on CPU and GPU if we care about mixed modes and their
869     // performance (e.g. spreading on GPU, gathering on CPU).
870     GMX_RELEASE_ASSERT(atc->nthread == 1, "Only the serial PME data layout is supported");
871     const uintmax_t threadIndex  = 0;
872     const auto      atomCount    = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->atoms.nAtoms;
873     const auto      atomsPerWarp = pme_gpu_get_atoms_per_warp(pmeGpu);
874     const auto      pmeOrder     = pmeGpu->common->pme_order;
875     GMX_ASSERT(pmeOrder == c_pmeGpuOrder, "Only PME order 4 is implemented");
876
877     real*  cpuSplineBuffer;
878     float* h_splineBuffer;
879     switch (type)
880     {
881         case PmeSplineDataType::Values:
882             cpuSplineBuffer = atc->spline[threadIndex].theta.coefficients[dimIndex];
883             h_splineBuffer  = pmeGpu->staging.h_theta;
884             break;
885
886         case PmeSplineDataType::Derivatives:
887             cpuSplineBuffer = atc->spline[threadIndex].dtheta.coefficients[dimIndex];
888             h_splineBuffer  = pmeGpu->staging.h_dtheta;
889             break;
890
891         default: GMX_THROW(gmx::InternalError("Unknown spline data type"));
892     }
893
894     for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
895     {
896         for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
897         {
898             const auto gpuValueIndex =
899                     getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
900             const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
901             GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder,
902                        "Atom spline data index out of bounds (while transforming GPU data layout "
903                        "for host)");
904             switch (transform)
905             {
906                 case PmeLayoutTransform::GpuToHost:
907                     cpuSplineBuffer[cpuValueIndex] = h_splineBuffer[gpuValueIndex];
908                     break;
909
910                 case PmeLayoutTransform::HostToGpu:
911                     h_splineBuffer[gpuValueIndex] = cpuSplineBuffer[cpuValueIndex];
912                     break;
913
914                 default: GMX_THROW(gmx::InternalError("Unknown layout transform"));
915             }
916         }
917     }
918 }
919
920 void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx::IVec* paddedGridSize)
921 {
922     GMX_ASSERT(gridSize != nullptr, "");
923     GMX_ASSERT(paddedGridSize != nullptr, "");
924     GMX_ASSERT(pmeGpu != nullptr, "");
925     auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
926     for (int i = 0; i < DIM; i++)
927     {
928         (*gridSize)[i]       = kernelParamsPtr->grid.realGridSize[i];
929         (*paddedGridSize)[i] = kernelParamsPtr->grid.realGridSizePadded[i];
930     }
931 }
932
933 void pme_gpu_reinit(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
934 {
935     GMX_ASSERT(pme != nullptr, "Need valid PME object");
936     if (pme->runMode == PmeRunMode::CPU)
937     {
938         GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object");
939         return;
940     }
941
942     if (!pme->gpu)
943     {
944         /* First-time initialization */
945         pme_gpu_init(pme, deviceInfo, pmeGpuProgram);
946     }
947     else
948     {
949         /* After this call nothing in the GPU code should refer to the gmx_pme_t *pme itself - until the next pme_gpu_reinit */
950         pme_gpu_copy_common_data_from(pme);
951     }
952     /* GPU FFT will only get used for a single rank. */
953     pme->gpu->settings.performGPUFFT =
954             (pme->gpu->common->runMode == PmeRunMode::GPU) && !pme->gpu->settings.useDecomposition;
955     pme->gpu->settings.performGPUSolve = (pme->gpu->common->runMode == PmeRunMode::GPU);
956
957     /* Reinit active timers */
958     pme_gpu_reinit_timings(pme->gpu);
959
960     pme_gpu_reinit_grids(pme->gpu);
961     // Note: if timing the reinit launch overhead becomes more relevant
962     // (e.g. with regular PP-PME re-balancing), we should pass wcycle here.
963     pme_gpu_reinit_computation(pme, nullptr);
964     /* Clear the previous box - doesn't hurt, and forces the PME CPU recipbox
965      * update for mixed mode on grid switch. TODO: use shared recipbox field.
966      */
967     std::memset(pme->gpu->common->previousBox, 0, sizeof(pme->gpu->common->previousBox));
968 }
969
970 void pme_gpu_destroy(PmeGpu* pmeGpu)
971 {
972     /* Free lots of data */
973     pme_gpu_free_energy_virial(pmeGpu);
974     pme_gpu_free_bspline_values(pmeGpu);
975     pme_gpu_free_forces(pmeGpu);
976     pme_gpu_free_coefficients(pmeGpu);
977     pme_gpu_free_spline_data(pmeGpu);
978     pme_gpu_free_grid_indices(pmeGpu);
979     pme_gpu_free_fract_shifts(pmeGpu);
980     pme_gpu_free_grids(pmeGpu);
981
982     pme_gpu_destroy_3dfft(pmeGpu);
983
984     /* Free the GPU-framework specific data last */
985     pme_gpu_destroy_specific(pmeGpu);
986
987     delete pmeGpu;
988 }
989
990 void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges)
991 {
992     auto* kernelParamsPtr         = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
993     kernelParamsPtr->atoms.nAtoms = nAtoms;
994     const int alignment           = pme_gpu_get_atom_data_alignment(pmeGpu);
995     pmeGpu->nAtomsPadded          = ((nAtoms + alignment - 1) / alignment) * alignment;
996     const int  nAtomsAlloc        = c_usePadding ? pmeGpu->nAtomsPadded : nAtoms;
997     const bool haveToRealloc =
998             (pmeGpu->nAtomsAlloc < nAtomsAlloc); /* This check might be redundant, but is logical */
999     pmeGpu->nAtomsAlloc = nAtomsAlloc;
1000
1001 #if GMX_DOUBLE
1002     GMX_RELEASE_ASSERT(false, "Only single precision supported");
1003     GMX_UNUSED_VALUE(charges);
1004 #else
1005     pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast<const float*>(charges));
1006     /* This could also be gated on haveToRealloc, but the copy always needs to be performed */
1007 #endif
1008
1009     if (haveToRealloc)
1010     {
1011         pme_gpu_realloc_forces(pmeGpu);
1012         pme_gpu_realloc_spline_data(pmeGpu);
1013         pme_gpu_realloc_grid_indices(pmeGpu);
1014     }
1015     pme_gpu_select_best_performing_pme_spreadgather_kernels(pmeGpu);
1016 }
1017
1018 /*! \internal \brief
1019  * Returns the raw timing event from the corresponding GpuRegionTimer (if timings are enabled).
1020  * In CUDA, the result can be a nullptr stub, per the GpuRegionTimer implementation.
1021  *
1022  * \param[in] pmeGpu         The PME GPU data structure.
1023  * \param[in] PMEStageId     The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h
1024  */
1025 static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PMEStageId)
1026 {
1027     CommandEvent* timingEvent = nullptr;
1028     if (pme_gpu_timings_enabled(pmeGpu))
1029     {
1030         GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
1031                    "Wrong PME GPU timing event index");
1032         timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent();
1033     }
1034     return timingEvent;
1035 }
1036
1037 void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, int grid_index)
1038 {
1039     int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;
1040
1041     pme_gpu_start_timing(pmeGpu, timerId);
1042     pmeGpu->archSpecific->fftSetup[grid_index]->perform3dFft(
1043             dir, pme_gpu_fetch_timing_event(pmeGpu, timerId));
1044     pme_gpu_stop_timing(pmeGpu, timerId);
1045 }
1046
1047 /*! \brief
1048  * Given possibly large \p blockCount, returns a compact 1D or 2D grid for kernel scheduling,
1049  * to minimize number of unused blocks.
1050  * to minimize the number of unused blocks.
1051 std::pair<int, int> inline pmeGpuCreateGrid(const PmeGpu* pmeGpu, int blockCount)
1052 {
1053     // How many rows of maximum X width do we need (hopefully just one)
1054     const int minRowCount = (blockCount + pmeGpu->maxGridWidthX - 1) / pmeGpu->maxGridWidthX;
1055     // Trying to make things even
1056     const int colCount = (blockCount + minRowCount - 1) / minRowCount;
1057     GMX_ASSERT((colCount * minRowCount - blockCount) >= 0, "pmeGpuCreateGrid: totally wrong");
1058     GMX_ASSERT((colCount * minRowCount - blockCount) < minRowCount,
1059                "pmeGpuCreateGrid: excessive blocks");
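    // Illustrative: blockCount = 100000 with maxGridWidthX = 65535 gives minRowCount = 2 and
    // colCount = 50000, i.e. a 50000 x 2 launch grid with no unused blocks.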
1060     return std::pair<int, int>(colCount, minRowCount);
1061 }
1062
1063 /*! \brief
1064  * Returns a pointer to the appropriate spline and spread kernel based on the input bool values
1065  *
1066  * \param[in]  pmeGpu                   The PME GPU structure.
1067  * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
1068  * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
1069  *
1070  * \return Pointer to CUDA kernel
1071  */
1072 static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool writeSplinesToGlobal)
1073 {
1074     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1075     if (writeSplinesToGlobal)
1076     {
1077         if (useOrderThreadsPerAtom)
1078         {
1079             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4;
1080         }
1081         else
1082         {
1083             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplines;
1084         }
1085     }
1086     else
1087     {
1088         if (useOrderThreadsPerAtom)
1089         {
1090             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
1091         }
1092         else
1093         {
1094             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
1095         }
1096     }
1097
1098     return kernelPtr;
1099 }
1100
1101 /*! \brief
1102  * Returns a pointer to the appropriate spline kernel based on the input bool values
1103  *
1104  * \param[in]  pmeGpu                   The PME GPU structure.
1105  * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
1106  * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
1107  *
1108  * \return Pointer to CUDA kernel
1109  */
1110 static auto selectSplineKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool gmx_unused writeSplinesToGlobal)
1111 {
1112     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1113     GMX_ASSERT(
1114             writeSplinesToGlobal,
1115             "Spline data should always be written to global memory when just calculating splines");
1116
1117     if (useOrderThreadsPerAtom)
1118     {
1119         kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4;
1120     }
1121     else
1122     {
1123         kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
1124     }
1125     return kernelPtr;
1126 }
1127
1128 /*! \brief
1129  * Returns a pointer to the appropriate spread kernel based on the input bool values
1130  *
1131  * \param[in]  pmeGpu                   The PME GPU structure.
1132  * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
1133  * \param[in]  writeSplinesToGlobal     bool controlling if we should write spline data to global memory
1134  *
1135  * \return Pointer to CUDA kernel
1136  */
1137 static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool writeSplinesToGlobal)
1138 {
1139     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1140     if (writeSplinesToGlobal)
1141     {
1142         if (useOrderThreadsPerAtom)
1143         {
1144             kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4;
1145         }
1146         else
1147         {
1148             kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
1149         }
1150     }
1151     else
1152     {
1153         /* If we are not saving the spline data, we need to recalculate it
1154            using the spline-and-spread kernel */
1155         if (useOrderThreadsPerAtom)
1156         {
1157             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
1158         }
1159         else
1160         {
1161             kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
1162         }
1163     }
1164     return kernelPtr;
1165 }
1166
1167 void pme_gpu_spread(const PmeGpu*         pmeGpu,
1168                     GpuEventSynchronizer* xReadyOnDevice,
1169                     int gmx_unused gridIndex,
1170                     real*          h_grid,
1171                     bool           computeSplines,
1172                     bool           spreadCharges)
1173 {
1174     GMX_ASSERT(computeSplines || spreadCharges,
1175                "PME spline/spread kernel has invalid input (nothing to do)");
1176     const auto* kernelParamsPtr = pmeGpu->kernelParams.get();
1177     GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread");
1178
1179     const size_t blockSize = pmeGpu->programHandle_->impl_->spreadWorkGroupSize;
1180
1181     const int order = pmeGpu->common->pme_order;
1182     GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
1183     const bool writeGlobal            = pmeGpu->settings.copyAllOutputs;
1184     const bool useOrderThreadsPerAtom = pmeGpu->settings.useOrderThreadsPerAtom;
1185     const bool recalculateSplines     = pmeGpu->settings.recalculateSplines;
1186 #if GMX_GPU == GMX_GPU_OPENCL
1187     GMX_ASSERT(!useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
1188     GMX_ASSERT(!recalculateSplines, "Recalculating splines not supported in OpenCL");
1189 #endif
1190     const int atomsPerBlock = useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
1191                                                      : blockSize / c_pmeSpreadGatherThreadsPerAtom;
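    // With PME order 4, order-threads-per-atom mode uses 4 threads per atom, while the default
    // mode uses order*order = 16 threads (cf. the 16-threads-per-atom assertion for OpenCL above).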
1192
1193     // TODO: pick smaller block size in runtime if needed
1194     // (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
1195     // If doing so, change atomsPerBlock in the kernels as well.
1196     // TODO: test varying block sizes on modern arch-s as well
1197     // TODO: also consider using cudaFuncSetCacheConfig() for preferring shared memory on older architectures
1198     // (for spline data mostly)
1199     GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
1200                "inconsistent atom data padding vs. spreading block size");
1201
1202     // Ensure that coordinates are ready on the device before launching spread;
1203     // only needed with CUDA on PP+PME ranks; not on separate PME ranks, in unit tests,
1204     // or in OpenCL, as these cases use a single stream (hence xReadyOnDevice == nullptr).
1205     GMX_ASSERT(xReadyOnDevice != nullptr || (GMX_GPU != GMX_GPU_CUDA)
1206                        || pmeGpu->common->isRankPmeOnly || pme_gpu_settings(pmeGpu).copyAllOutputs,
1207                "Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
1208     if (xReadyOnDevice)
1209     {
1210         xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
1211     }
1212
1213     const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
1214     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
1215
1216     KernelLaunchConfig config;
1217     config.blockSize[0] = order;
1218     config.blockSize[1] = useOrderThreadsPerAtom ? 1 : order;
1219     config.blockSize[2] = atomsPerBlock;
1220     config.gridSize[0]  = dimGrid.first;
1221     config.gridSize[1]  = dimGrid.second;
1222     config.stream       = pmeGpu->archSpecific->pmeStream;
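    // The spread launch thus uses blocks of order x 1 x atomsPerBlock threads when running
    // order threads per atom, and order x order x atomsPerBlock threads otherwise.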
1223
1224     int                                timingId;
1225     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1226     if (computeSplines)
1227     {
1228         if (spreadCharges)
1229         {
1230             timingId  = gtPME_SPLINEANDSPREAD;
1231             kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, useOrderThreadsPerAtom,
1232                                                        writeGlobal || (!recalculateSplines));
1233         }
1234         else
1235         {
1236             timingId  = gtPME_SPLINE;
1237             kernelPtr = selectSplineKernelPtr(pmeGpu, useOrderThreadsPerAtom,
1238                                               writeGlobal || (!recalculateSplines));
1239         }
1240     }
1241     else
1242     {
1243         timingId  = gtPME_SPREAD;
1244         kernelPtr = selectSpreadKernelPtr(pmeGpu, useOrderThreadsPerAtom,
1245                                           writeGlobal || (!recalculateSplines));
1246     }
1247
1248
1249     pme_gpu_start_timing(pmeGpu, timingId);
1250     auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
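    // When device buffers can be embedded into the kernel-parameter structure (c_canEmbedBuffers),
    // passing kernelParamsPtr alone is sufficient; otherwise each device buffer is passed to the
    // kernel as a separate argument.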
1251 #if c_canEmbedBuffers
1252     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
1253 #else
1254     const auto kernelArgs = prepareGpuKernelArguments(
1255             kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_theta,
1256             &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
1257             &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->grid.d_fractShiftsTable,
1258             &kernelParamsPtr->grid.d_gridlineIndicesTable, &kernelParamsPtr->atoms.d_coefficients,
1259             &kernelParamsPtr->atoms.d_coordinates);
1260 #endif
1261
1262     launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs);
1263     pme_gpu_stop_timing(pmeGpu, timingId);
1264
1265     const auto& settings    = pmeGpu->settings;
1266     const bool copyBackGrid = spreadCharges && (!settings.performGPUFFT || settings.copyAllOutputs);
1267     if (copyBackGrid)
1268     {
1269         pme_gpu_copy_output_spread_grid(pmeGpu, h_grid);
1270     }
1271     const bool copyBackAtomData =
1272             computeSplines && (!settings.performGPUGather || settings.copyAllOutputs);
1273     if (copyBackAtomData)
1274     {
1275         pme_gpu_copy_output_spread_atom_data(pmeGpu);
1276     }
1277 }
1278
1279 void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial)
1280 {
1281     const auto& settings               = pmeGpu->settings;
1282     const bool  copyInputAndOutputGrid = !settings.performGPUFFT || settings.copyAllOutputs;
1283
1284     auto* kernelParamsPtr = pmeGpu->kernelParams.get();
1285
1286     float* h_gridFloat = reinterpret_cast<float*>(h_grid);
1287     if (copyInputAndOutputGrid)
1288     {
1289         copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, h_gridFloat, 0,
1290                            pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream,
1291                            pmeGpu->settings.transferKind, nullptr);
1292     }
1293
1294     int majorDim = -1, middleDim = -1, minorDim = -1;
1295     switch (gridOrdering)
1296     {
1297         case GridOrdering::YZX:
1298             majorDim  = YY;
1299             middleDim = ZZ;
1300             minorDim  = XX;
1301             break;
1302
1303         case GridOrdering::XYZ:
1304             majorDim  = XX;
1305             middleDim = YY;
1306             minorDim  = ZZ;
1307             break;
1308
1309         default: GMX_ASSERT(false, "Implement grid ordering here and below for the kernel launch");
1310     }
1311
1312     const int maxBlockSize = pmeGpu->programHandle_->impl_->solveMaxWorkGroupSize;
1313
1314     const int gridLineSize      = pmeGpu->kernelParams->grid.complexGridSize[minorDim];
1315     const int gridLinesPerBlock = std::max(maxBlockSize / gridLineSize, 1);
1316     const int blocksPerGridLine = (gridLineSize + maxBlockSize - 1) / maxBlockSize;
1317     int       cellsPerBlock;
1318     if (blocksPerGridLine == 1)
1319     {
1320         cellsPerBlock = gridLineSize * gridLinesPerBlock;
1321     }
1322     else
1323     {
1324         cellsPerBlock = (gridLineSize + blocksPerGridLine - 1) / blocksPerGridLine;
1325     }
1326     const int warpSize  = pmeGpu->programHandle_->impl_->warpSize;
1327     const int blockSize = (cellsPerBlock + warpSize - 1) / warpSize * warpSize;
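    // Worked example with illustrative numbers (not taken from this file): gridLineSize = 100
    // and maxBlockSize = 256 give gridLinesPerBlock = 2 and blocksPerGridLine = 1, hence
    // cellsPerBlock = 200, which a warpSize of 32 rounds up to a blockSize of 224 threads.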
1328
1329     static_assert(GMX_GPU != GMX_GPU_CUDA || c_solveMaxWarpsPerBlock / 2 >= 4,
1330                   "The CUDA solve energy kernels need at least 4 warps. "
1331                   "Here we launch at least half of the maximum number of warps.");
1332
1333     KernelLaunchConfig config;
1334     config.blockSize[0] = blockSize;
1335     config.gridSize[0]  = blocksPerGridLine;
1336     // rounding up to full warps so that shuffle operations produce defined results
1337     config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1)
1338                          / gridLinesPerBlock;
1339     config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim];
1340     config.stream      = pmeGpu->archSpecific->pmeStream;
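    // Each block therefore processes up to cellsPerBlock grid cells: the launch grid covers the
    // minor dimension with blocksPerGridLine blocks per grid line, the middle dimension in chunks
    // of gridLinesPerBlock lines, and every plane of the major dimension.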
1341
1342     int                                timingId  = gtPME_SOLVE;
1343     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1344     if (gridOrdering == GridOrdering::YZX)
1345     {
1346         kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernel
1347                                            : pmeGpu->programHandle_->impl_->solveYZXKernel;
1348     }
1349     else if (gridOrdering == GridOrdering::XYZ)
1350     {
1351         kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernel
1352                                            : pmeGpu->programHandle_->impl_->solveXYZKernel;
1353     }
1354
1355     pme_gpu_start_timing(pmeGpu, timingId);
1356     auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
1357 #if c_canEmbedBuffers
1358     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
1359 #else
1360     const auto kernelArgs = prepareGpuKernelArguments(
1361             kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli,
1362             &kernelParamsPtr->constants.d_virialAndEnergy, &kernelParamsPtr->grid.d_fourierGrid);
1363 #endif
1364     launchGpuKernel(kernelPtr, config, timingEvent, "PME solve", kernelArgs);
1365     pme_gpu_stop_timing(pmeGpu, timingId);
1366
1367     if (computeEnergyAndVirial)
1368     {
1369         copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy,
1370                              &kernelParamsPtr->constants.d_virialAndEnergy, 0, c_virialAndEnergyCount,
1371                              pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
1372     }
1373
1374     if (copyInputAndOutputGrid)
1375     {
1376         copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid, 0,
1377                              pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream,
1378                              pmeGpu->settings.transferKind, nullptr);
1379     }
1380 }
1381
1382 /*! \brief
1383  * Returns a pointer to the appropriate gather kernel based on the input values
1384  *
1385  * \param[in]  pmeGpu                   The PME GPU structure.
1386  * \param[in]  useOrderThreadsPerAtom   bool controlling if we should use order or order*order threads per atom
1387  * \param[in]  readSplinesFromGlobal    bool controlling if we should read the spline data from global memory
1388  *
1389  * \return Pointer to CUDA kernel
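 *
 * A usage sketch, mirroring the call made from pme_gpu_gather() below:
 * \code
 *   PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
 *           selectGatherKernelPtr(pmeGpu, useOrderThreadsPerAtom, readGlobal || (!recalculateSplines));
 * \endcode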
1390  */
1391 inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, bool useOrderThreadsPerAtom, bool readSplinesFromGlobal)
1392
1393 {
1394     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
1395
1396     if (readSplinesFromGlobal)
1397     {
1398         if (useOrderThreadsPerAtom)
1399         {
1400             kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4;
1401         }
1402         else
1403         {
1404             kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplines;
1405         }
1406     }
1407     else
1408     {
1409         if (useOrderThreadsPerAtom)
1410         {
1411             kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4;
1412         }
1413         else
1414         {
1415             kernelPtr = pmeGpu->programHandle_->impl_->gatherKernel;
1416         }
1417     }
1418     return kernelPtr;
1419 }
1420
1421
1422 void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid)
1423 {
1424     const auto& settings = pmeGpu->settings;
1425     if (!settings.performGPUFFT || settings.copyAllOutputs)
1426     {
1427         pme_gpu_copy_input_gather_grid(pmeGpu, const_cast<float*>(h_grid));
1428     }
1429
1430     if (settings.copyAllOutputs)
1431     {
1432         pme_gpu_copy_input_gather_atom_data(pmeGpu);
1433     }
1434
1435     /* copyAllOutputs is set in unit tests; when it is, the gather kernel reads the spline data from global memory */
1436     const bool   readGlobal             = pmeGpu->settings.copyAllOutputs;
1437     const size_t blockSize              = pmeGpu->programHandle_->impl_->gatherWorkGroupSize;
1438     const bool   useOrderThreadsPerAtom = pmeGpu->settings.useOrderThreadsPerAtom;
1439     const bool   recalculateSplines     = pmeGpu->settings.recalculateSplines;
1440 #if GMX_GPU == GMX_GPU_OPENCL
1441     GMX_ASSERT(!useOrderThreadsPerAtom, "Only 16 threads per atom supported in OpenCL");
1442     GMX_ASSERT(!recalculateSplines, "Recalculating splines not supported in OpenCL");
1443 #endif
1444     const int atomsPerBlock = useOrderThreadsPerAtom ? blockSize / c_pmeSpreadGatherThreadsPerAtom4ThPerAtom
1445                                                      : blockSize / c_pmeSpreadGatherThreadsPerAtom;
1446
1447     GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock),
1448                "inconsistent atom data padding vs. gathering block size");
1449
1450     const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
1451     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
1452
1453     const int order = pmeGpu->common->pme_order;
1454     GMX_ASSERT(order == c_pmeGpuOrder, "Only PME order 4 is implemented");
1455
1456     KernelLaunchConfig config;
1457     config.blockSize[0] = order;
1458     config.blockSize[1] = useOrderThreadsPerAtom ? 1 : order;
1459     config.blockSize[2] = atomsPerBlock;
1460     config.gridSize[0]  = dimGrid.first;
1461     config.gridSize[1]  = dimGrid.second;
1462     config.stream       = pmeGpu->archSpecific->pmeStream;
1463
1464     // TODO test different cache configs
1465
1466     int                                timingId = gtPME_GATHER;
1467     PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
1468             selectGatherKernelPtr(pmeGpu, useOrderThreadsPerAtom, readGlobal || (!recalculateSplines));
1469     // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
1470
1471     pme_gpu_start_timing(pmeGpu, timingId);
1472     auto*       timingEvent     = pme_gpu_fetch_timing_event(pmeGpu, timingId);
1473     const auto* kernelParamsPtr = pmeGpu->kernelParams.get();
1474 #if c_canEmbedBuffers
1475     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
1476 #else
1477     const auto kernelArgs = prepareGpuKernelArguments(
1478             kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients,
1479             &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->atoms.d_theta,
1480             &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
1481             &kernelParamsPtr->atoms.d_forces);
1482 #endif
1483     launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
1484     pme_gpu_stop_timing(pmeGpu, timingId);
1485
1486     if (pmeGpu->settings.useGpuForceReduction)
1487     {
1488         pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream);
1489     }
1490     else
1491     {
1492         pme_gpu_copy_output_forces(pmeGpu);
1493     }
1494 }
1495
1496 void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
1497 {
1498     if (pmeGpu && pmeGpu->kernelParams)
1499     {
1500         return pmeGpu->kernelParams->atoms.d_forces;
1501     }
1502     else
1503     {
1504         return nullptr;
1505     }
1506 }
1507
1508 void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx::RVec> d_x)
1509 {
1510     GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
1511                "PME GPU device buffer cannot be set in non-GPU builds or before the GPU PME was "
1512                "initialized.");
1513
1514     GMX_ASSERT(checkDeviceBuffer(d_x, pmeGpu->kernelParams->atoms.nAtoms),
1515                "The device-side buffer cannot be set.");
1516
1517     pmeGpu->kernelParams->atoms.d_coordinates = d_x;
1518 }
1519
1520 void* pme_gpu_get_stream(const PmeGpu* pmeGpu)
1521 {
1522     if (pmeGpu)
1523     {
1524         return static_cast<void*>(&pmeGpu->archSpecific->pmeStream);
1525     }
1526     else
1527     {
1528         return nullptr;
1529     }
1530 }
1531
1532 void* pme_gpu_get_context(const PmeGpu* pmeGpu)
1533 {
1534     if (pmeGpu)
1535     {
1536         return static_cast<void*>(&pmeGpu->archSpecific->context);
1537     }
1538     else
1539     {
1540         return nullptr;
1541     }
1542 }
1543
1544 GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
1545 {
1546     if (pmeGpu && pmeGpu->kernelParams)
1547     {
1548         return &pmeGpu->archSpecific->pmeForcesReady;
1549     }
1550     else
1551     {
1552         return nullptr;
1553     }
1554 }