Removed cu_realloc_buffered() in favor of reallocateDeviceBuffer()
src/gromacs/ewald/pme.cu
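As a rough, self-contained sketch (placeholder names, not part of the file below) of the buffered-reallocation call pattern this change standardizes on throughout pme.cu:

    // Hypothetical device buffer with the size/capacity counters that
    // reallocateDeviceBuffer() maintains (names are placeholders).
    float       *d_values        = nullptr;
    int          valuesSize      = 0;  // element count currently in use
    int          valuesSizeAlloc = 0;  // element count currently allocated
    cudaStream_t stream          = nullptr; // stands in for the PME stream used in this file
    const int    newSize         = 3 * 1000; // e.g. DIM floats per atom, as for d_forces

    // Reallocates the device memory when the requested size exceeds the tracked
    // capacity, replacing the old cu_realloc_buffered() helper.
    reallocateDeviceBuffer(&d_values, newSize, &valuesSize, &valuesSizeAlloc, stream);

    // Matching cleanup, as used by the pme_gpu_free_*() functions below.
    freeDeviceBuffer(&d_values);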
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief This file contains internal CUDA function implementations
 * for performing the PME calculations on GPU.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

#include "gmxpre.h"

#include <cmath>

/* The rest */
#include "pme.h"

#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"

#include "pme.cuh"
#include "pme-3dfft.cuh"
#include "pme-grid.h"

int pme_gpu_get_atom_data_alignment(const PmeGpu *pmeGpu)
{
    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order > 0, "Invalid PME order");
    return PME_ATOM_DATA_ALIGNMENT;
}

int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
{
    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order > 0, "Invalid PME order");
    return PME_SPREADGATHER_ATOMS_PER_WARP;
}

void pme_gpu_synchronize(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaStreamSynchronize(pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!");
}

void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu)
{
    const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
    cudaError_t  stat                = cudaMalloc((void **)&pmeGpu->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize);
    CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial");
    pmalloc((void **)&pmeGpu->staging.h_virialAndEnergy, energyAndVirialSize);
}

void pme_gpu_free_energy_virial(PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaFree(pmeGpu->kernelParams->constants.d_virialAndEnergy);
    CU_RET_ERR(stat, "cudaFree failed on PME energy and virial");
    pmeGpu->kernelParams->constants.d_virialAndEnergy = nullptr;
    pfree(pmeGpu->staging.h_virialAndEnergy);
    pmeGpu->staging.h_virialAndEnergy = nullptr;
}

void pme_gpu_clear_energy_virial(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->constants.d_virialAndEnergy, 0,
                                       c_virialAndEnergyCount * sizeof(float), pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME energy/virial cudaMemsetAsync error");
}

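/* The B-spline moduli for the three dimensions are stored back to back in a single device
 * buffer; grid.splineValuesOffset records where each dimension starts (0, nx, nx + ny),
 * matching the memcpy loop below.
 */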
void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu)
{
    const int splineValuesOffset[DIM] = {
        0,
        pmeGpu->kernelParams->grid.realGridSize[XX],
        pmeGpu->kernelParams->grid.realGridSize[XX] + pmeGpu->kernelParams->grid.realGridSize[YY]
    };
    memcpy((void *)&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));

    const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX] +
        pmeGpu->kernelParams->grid.realGridSize[YY] +
        pmeGpu->kernelParams->grid.realGridSize[ZZ];
    const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
                           &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    if (shouldRealloc)
    {
        /* Reallocate the host buffer */
        pfree(pmeGpu->staging.h_splineModuli);
        pmalloc((void **)&pmeGpu->staging.h_splineModuli, newSplineValuesSize * sizeof(float));
    }
    for (int i = 0; i < DIM; i++)
    {
        memcpy(pmeGpu->staging.h_splineModuli + splineValuesOffset[i], pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
    }
    /* TODO: pin original buffer instead! */
    cu_copy_H2D(pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli,
                newSplineValuesSize * sizeof(float), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_free_bspline_values(const PmeGpu *pmeGpu)
{
    pfree(pmeGpu->staging.h_splineModuli);
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli);
}

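/* Forces occupy DIM floats per atom; the host staging vector keeps capacity for the padded
 * allocation while its size tracks the current atom count.
 */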
void pme_gpu_realloc_forces(PmeGpu *pmeGpu)
{
    const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
    GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    pmeGpu->staging.h_forces.reserve(pmeGpu->nAtomsAlloc);
    pmeGpu->staging.h_forces.resize(pmeGpu->kernelParams->atoms.nAtoms);
}

void pme_gpu_free_forces(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces);
}

void pme_gpu_copy_input_forces(PmeGpu *pmeGpu)
{
    const size_t forcesSize = DIM * pmeGpu->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_H2D(pmeGpu->kernelParams->atoms.d_forces, pmeGpu->staging.h_forces.data(), forcesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_copy_output_forces(PmeGpu *pmeGpu)
{
    const size_t forcesSize = DIM * pmeGpu->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_D2H(pmeGpu->staging.h_forces.data(), pmeGpu->kernelParams->atoms.d_forces, forcesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

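/* Coordinates occupy DIM floats per atom. When padding is enabled, the region between the
 * last real atom and the end of the padded allocation is zeroed, so that kernels working on
 * whole warps read defined values for the dummy atoms.
 */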
void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu)
{
    const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
    GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
                           &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    if (c_usePadding)
    {
        const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
        const size_t paddingCount = DIM * pmeGpu->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->atoms.d_coordinates + paddingIndex, 0, paddingCount * sizeof(float), pmeGpu->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded coordinates");
        }
    }
}

void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordinates)
{
    GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU");
#if GMX_DOUBLE
    GMX_RELEASE_ASSERT(false, "Only single precision is supported");
    GMX_UNUSED_VALUE(h_coordinates);
#else
    cu_copy_H2D(pmeGpu->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
                pmeGpu->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
#endif
}

void pme_gpu_free_coordinates(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates);
}

void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const float *h_coefficients)
{
    GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
    const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
                           &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D(pmeGpu->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
                pmeGpu->kernelParams->atoms.nAtoms * sizeof(float), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    if (c_usePadding)
    {
        const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
        const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->atoms.d_coefficients + paddingIndex, 0, paddingCount * sizeof(float), pmeGpu->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded charges");
        }
    }
}

void pme_gpu_free_coefficients(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients);
}

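/* d_theta and d_dtheta always have identical sizes, so a single pair of size counters is
 * kept in archSpecific. The first reallocateDeviceBuffer() call below works on throwaway
 * copies of those counters, so that the second call still sees the old size and both
 * buffers get reallocated consistently.
 */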
void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu)
{
    const int    order             = pmeGpu->common->pme_order;
    const int    alignment         = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded      = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const int    newSplineDataSize = DIM * order * nAtomsPadded;
    GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
    /* Two arrays of the same size */
    const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
    int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
    int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->pmeStream);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->pmeStream);
    // the host side reallocation
    if (shouldRealloc)
    {
        pfree(pmeGpu->staging.h_theta);
        pmalloc((void **)&pmeGpu->staging.h_theta, newSplineDataSize * sizeof(float));
        pfree(pmeGpu->staging.h_dtheta);
        pmalloc((void **)&pmeGpu->staging.h_dtheta, newSplineDataSize * sizeof(float));
    }
}

void pme_gpu_free_spline_data(const PmeGpu *pmeGpu)
{
    /* Two arrays of the same size */
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta);
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta);
    pfree(pmeGpu->staging.h_theta);
    pfree(pmeGpu->staging.h_dtheta);
}

void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu)
{
    const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
                           &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    pfree(pmeGpu->staging.h_gridlineIndices);
    pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
}

void pme_gpu_free_grid_indices(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices);
    pfree(pmeGpu->staging.h_gridlineIndices);
}

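/* With out-of-place FFT the real and complex grids are separate allocations; otherwise a
 * single buffer large enough for either representation is used and d_fourierGrid aliases
 * d_realGrid.
 */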
void pme_gpu_realloc_grids(PmeGpu *pmeGpu)
{
    auto     *kernelParamsPtr = pmeGpu->kernelParams.get();
    const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] *
        kernelParamsPtr->grid.realGridSizePadded[YY] *
        kernelParamsPtr->grid.realGridSizePadded[ZZ];
    const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] *
        kernelParamsPtr->grid.complexGridSizePadded[YY] *
        kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
    // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        /* 2 separate grids */
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
                               &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
    }
    else
    {
        /* A single buffer so that any grid will fit */
        const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newGridsSize,
                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
        kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
        pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
        // the size might get used later for copying the grid
    }
}

void pme_gpu_free_grids(const PmeGpu *pmeGpu)
{
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid);
    }
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid);
}

void pme_gpu_clear_grids(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->grid.d_realGrid, 0,
                                       pmeGpu->archSpecific->realGridSize * sizeof(float), pmeGpu->archSpecific->pmeStream);
    /* Should the complex grid be cleared in some weird case? */
    CU_RET_ERR(stat, "cudaMemsetAsync on the PME grid error");
}

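/* The fractional shift and gridline-index tables for the three dimensions are packed into
 * single arrays of cellCount * (nx + ny + nz) entries; grid.tablesOffsets records where each
 * dimension starts. Both tables are uploaded as texture-backed lookup tables.
 */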
void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *pmeGpu)
{
    pme_gpu_free_fract_shifts(pmeGpu);

    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();

    const int    nx                  = kernelParamsPtr->grid.realGridSize[XX];
    const int    ny                  = kernelParamsPtr->grid.realGridSize[YY];
    const int    nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
    const int    cellCount           = c_pmeNeighborUnitcellCount;
    const int    gridDataOffset[DIM] = {0, cellCount * nx, cellCount * (nx + ny)};

    memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));

    const int    newFractShiftsSize  = cellCount * (nx + ny + nz);

    initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                         kernelParamsPtr->fractShiftsTableTexture,
                         pmeGpu->common->fsh.data(),
                         newFractShiftsSize,
                         pmeGpu->deviceInfo);

    initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                         kernelParamsPtr->gridlineIndicesTableTexture,
                         pmeGpu->common->nn.data(),
                         newFractShiftsSize,
                         pmeGpu->deviceInfo);
}

void pme_gpu_free_fract_shifts(const PmeGpu *pmeGpu)
{
    auto *kernelParamsPtr = pmeGpu->kernelParams.get();
    destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                            kernelParamsPtr->fractShiftsTableTexture,
                            pmeGpu->deviceInfo);
    destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                            kernelParamsPtr->gridlineIndicesTableTexture,
                            pmeGpu->deviceInfo);
}

bool pme_gpu_stream_query(const PmeGpu *pmeGpu)
{
    return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_H2D(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_copy_output_spread_grid(const PmeGpu *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_D2H(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cudaError_t  stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME spread grid sync event record failure");
}

void pme_gpu_copy_output_spread_atom_data(const PmeGpu *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
    cu_copy_D2H(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_D2H(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_D2H(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
                kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

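/* Uploads spline parameters and gridline indices that were produced outside the GPU spread
 * kernel; per the TODO below this path is mainly exercised by tests. With padding enabled,
 * the whole padded region is cleared before the copies.
 */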
void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
    if (c_usePadding)
    {
        const size_t gridlineIndicesSizePerAtom = DIM * sizeof(int);
        const size_t splineDataSizePerAtom      = pmeGpu->common->pme_order * DIM * sizeof(float);
        // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * gridlineIndicesSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the gridline indices");
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_dtheta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the spline derivatives");
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_theta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the spline values");
    }
    cu_copy_H2D(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
                kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_sync_spread_grid(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaEventSynchronize(pmeGpu->archSpecific->syncSpreadGridD2H);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
}

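/* Sets up the CUDA-specific PME data: allocates the arch-specific and kernel-parameter
 * structures, decides on out-of-place FFT and timing, and creates the PME stream with the
 * highest available priority.
 */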
void pme_gpu_init_internal(PmeGpu *pmeGpu)
{
    /* Allocate the target-specific structures */
    pmeGpu->archSpecific.reset(new PmeGpuSpecific());
    pmeGpu->kernelParams.reset(new PmeGpuKernelParams());

    pmeGpu->archSpecific->performOutOfPlaceFFT = true;
    /* This should give better performance, according to the cuFFT documentation.
     * The performance seems to be the same though.
     * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
     */

    /* WARNING: CUDA timings are incorrect with multiple streams.
     *          This is the main reason why they are disabled by default.
     */
    // TODO: Consider turning on by default when we can detect nr of streams.
    pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);

    pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];

    /* Creating a PME CUDA stream */
    cudaError_t stat;
    int         highest_priority, lowest_priority;
    stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
    CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
    stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream,
                                        cudaStreamDefault, //cudaStreamNonBlocking,
                                        highest_priority);
    CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
}

void pme_gpu_destroy_specific(const PmeGpu *pmeGpu)
{
    /* Destroy the CUDA stream */
    cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME cudaStreamDestroy error");
}

void pme_gpu_init_sync_events(const PmeGpu *pmeGpu)
{
    const auto  eventFlags = cudaEventDisableTiming;
    CU_RET_ERR(cudaEventCreateWithFlags(&pmeGpu->archSpecific->syncSpreadGridD2H, eventFlags), "cudaEventCreate on syncSpreadGridD2H failed");
}

void pme_gpu_destroy_sync_events(const PmeGpu *pmeGpu)
{
    CU_RET_ERR(cudaEventDestroy(pmeGpu->archSpecific->syncSpreadGridD2H), "cudaEventDestroy failed on syncSpreadGridD2H");
}

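/* Recreates the cuFFT-based 3D-FFT setup, one GpuParallel3dFft instance per grid, but only
 * when the FFT stage actually runs on the GPU.
 */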
void pme_gpu_reinit_3dfft(const PmeGpu *pmeGpu)
{
    if (pme_gpu_performs_FFT(pmeGpu))
    {
        pmeGpu->archSpecific->fftSetup.resize(0);
        for (int i = 0; i < pmeGpu->common->ngrids; i++)
        {
            pmeGpu->archSpecific->fftSetup.push_back(std::unique_ptr<GpuParallel3dFft>(new GpuParallel3dFft(pmeGpu)));
        }
    }
}

void pme_gpu_destroy_3dfft(const PmeGpu *pmeGpu)
{
    pmeGpu->archSpecific->fftSetup.resize(0);
}

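/* Maps a (dimension, spline index, warp index, atom-in-warp index) tuple to the flat index
 * used by the GPU spline-parameter layout. The index helpers are templated on a compile-time
 * order and only order 4 is instantiated here, hence the throw for any other order.
 */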
int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int warpIndex, int atomWarpIndex)
{
    if (order != 4)
    {
        throw order;
    }
    constexpr int fixedOrder = 4;
    GMX_UNUSED_VALUE(fixedOrder);
    const int     indexBase  = getSplineParamIndexBase<fixedOrder>(warpIndex, atomWarpIndex);
    return getSplineParamIndex<fixedOrder>(indexBase, dimIndex, splineIndex);
}