/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief This file contains internal CUDA function implementations
 * for performing the PME calculations on GPU.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

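/* Overview: the functions below manage the device-side (and pinned host staging)
 * buffers used by PME on the GPU, the PME CUDA stream and its synchronization
 * events, and the cuFFT-based 3D FFT setups. A rough, illustrative sketch of how
 * the higher-level PME GPU setup code typically drives them (the actual call
 * sites live elsewhere in the ewald module; this ordering is not normative):
 *
 *   pme_gpu_init_internal(pmeGPU);          // stream + arch-specific structures
 *   pme_gpu_init_sync_events(pmeGPU);       // D2H completion events
 *   pme_gpu_alloc_energy_virial(pmeGPU);    // constant-size output buffers
 *   // on grid (re)initialization:
 *   pme_gpu_realloc_and_copy_bspline_values(pmeGPU);
 *   pme_gpu_realloc_and_copy_fract_shifts(pmeGPU);
 *   pme_gpu_realloc_grids(pmeGPU);
 *   pme_gpu_reinit_3dfft(pmeGPU);
 *   // on atom-data changes: coordinates, coefficients, forces, spline data
 */
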
#include "gmxpre.h"

#include <cmath>

/* The rest */
#include "pme.h"

#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"

#include "pme.cuh"
#include "pme-3dfft.cuh"
#include "pme-grid.h"

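/* Alignment getters: the number of atoms the per-atom data buffers (and, below,
 * the spline data, which follows the atoms-per-warp count of spread/gather) are
 * padded to. The order assertions merely guard against uninitialized PME data.
 */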
int pme_gpu_get_atom_data_alignment(const pme_gpu_t *pmeGPU)
{
    const int order = pmeGPU->common->pme_order;
    GMX_RELEASE_ASSERT(order > 0, "Invalid PME order");
    return PME_ATOM_DATA_ALIGNMENT;
}

int pme_gpu_get_atom_spline_data_alignment(const pme_gpu_t *pmeGPU)
{
    const int order = pmeGPU->common->pme_order;
    GMX_RELEASE_ASSERT(order > 0, "Invalid PME order");
    return PME_SPREADGATHER_ATOMS_PER_WARP;
}

void pme_gpu_synchronize(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaStreamSynchronize(pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!");
}

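/* Energy/virial output: a small constant-size device buffer plus a matching pinned
 * host staging buffer (c_virialAndEnergyCount floats each), used for the solve
 * stage's energy and virial output.
 */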
void pme_gpu_alloc_energy_virial(const pme_gpu_t *pmeGPU)
{
    const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
    cudaError_t  stat                = cudaMalloc((void **)&pmeGPU->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize);
    CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial");
    pmalloc((void **)&pmeGPU->staging.h_virialAndEnergy, energyAndVirialSize);
}

void pme_gpu_free_energy_virial(pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaFree(pmeGPU->kernelParams->constants.d_virialAndEnergy);
    CU_RET_ERR(stat, "cudaFree failed on PME energy and virial");
    pmeGPU->kernelParams->constants.d_virialAndEnergy = nullptr;
    pfree(pmeGPU->staging.h_virialAndEnergy);
    pmeGPU->staging.h_virialAndEnergy = nullptr;
}

void pme_gpu_clear_energy_virial(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->constants.d_virialAndEnergy, 0,
                                       c_virialAndEnergyCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME energy/virial cudaMemsetAsync error");
}

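/* B-spline moduli for the three grid dimensions, packed back to back into a single
 * device buffer; splineValuesOffset[] records where each dimension's values start.
 */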
void pme_gpu_realloc_and_copy_bspline_values(const pme_gpu_t *pmeGPU)
{
    const int splineValuesOffset[DIM] = {
        0,
        pmeGPU->kernelParams->grid.realGridSize[XX],
        pmeGPU->kernelParams->grid.realGridSize[XX] + pmeGPU->kernelParams->grid.realGridSize[YY]
    };
    memcpy((void *)&pmeGPU->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));

    const int newSplineValuesSize = pmeGPU->kernelParams->grid.realGridSize[XX] +
        pmeGPU->kernelParams->grid.realGridSize[YY] +
        pmeGPU->kernelParams->grid.realGridSize[ZZ];
    const bool shouldRealloc = (newSplineValuesSize > pmeGPU->archSpecific->splineValuesSize);
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->grid.d_splineModuli, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->splineValuesSize, &pmeGPU->archSpecific->splineValuesSizeAlloc, newSplineValuesSize, pmeGPU->archSpecific->pmeStream, true);
    if (shouldRealloc)
    {
        /* Reallocate the host buffer */
        pfree(pmeGPU->staging.h_splineModuli);
        pmalloc((void **)&pmeGPU->staging.h_splineModuli, newSplineValuesSize * sizeof(float));
    }
    for (int i = 0; i < DIM; i++)
    {
        memcpy(pmeGPU->staging.h_splineModuli + splineValuesOffset[i], pmeGPU->common->bsp_mod[i].data(), pmeGPU->common->bsp_mod[i].size() * sizeof(float));
    }
    /* TODO: pin original buffer instead! */
    cu_copy_H2D_async(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
                      newSplineValuesSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
}

void pme_gpu_free_bspline_values(const pme_gpu_t *pmeGPU)
{
    pfree(pmeGPU->staging.h_splineModuli);
    cu_free_buffered(pmeGPU->kernelParams->grid.d_splineModuli, &pmeGPU->archSpecific->splineValuesSize,
                     &pmeGPU->archSpecific->splineValuesSizeAlloc);
}

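/* Device force buffer: DIM floats per atom, sized for the padded (allocated) atom count. */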
void pme_gpu_realloc_forces(const pme_gpu_t *pmeGPU)
{
    const size_t newForcesSize = pmeGPU->nAtomsAlloc * DIM;
    GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_forces, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc, newForcesSize, pmeGPU->archSpecific->pmeStream, true);
}

void pme_gpu_free_forces(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_forces, &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc);
}

void pme_gpu_copy_input_forces(const pme_gpu_t *pmeGPU, const float *h_forces)
{
    GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
    const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->archSpecific->pmeStream);
}

void pme_gpu_copy_output_forces(const pme_gpu_t *pmeGPU, float *h_forces)
{
    GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
    const size_t forcesSize   = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream);
    cudaError_t stat = cudaEventRecord(pmeGPU->archSpecific->syncForcesD2H, pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME gather forces synchronization failure");
}

void pme_gpu_sync_output_forces(const pme_gpu_t *pmeGPU)
{
    cudaStream_t s    = pmeGPU->archSpecific->pmeStream;
    cudaError_t  stat = cudaStreamWaitEvent(s, pmeGPU->archSpecific->syncForcesD2H, 0);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU forces");
}

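/* Device coordinate buffer: DIM floats per atom, sized for the padded atom count.
 * With padding enabled the extra entries are cleared to zero, so the dummy atoms
 * beyond nAtoms hold well-defined values.
 */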
void pme_gpu_realloc_coordinates(const pme_gpu_t *pmeGPU)
{
    const size_t newCoordinatesSize = pmeGPU->nAtomsAlloc * DIM;
    GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coordinates, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc, newCoordinatesSize, pmeGPU->archSpecific->pmeStream, true);
    if (c_usePadding)
    {
        const size_t paddingIndex = DIM * pmeGPU->kernelParams->atoms.nAtoms;
        const size_t paddingCount = DIM * pmeGPU->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coordinates + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded coordinates");
        }
    }
}

void pme_gpu_copy_input_coordinates(const pme_gpu_t *pmeGPU, const rvec *h_coordinates)
{
    GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU");
#if GMX_DOUBLE
    GMX_RELEASE_ASSERT(false, "Only single precision is supported");
    GMX_UNUSED_VALUE(h_coordinates);
#else
    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
                      pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->archSpecific->pmeStream);
#endif
}

void pme_gpu_free_coordinates(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_coordinates, &pmeGPU->archSpecific->coordinatesSize, &pmeGPU->archSpecific->coordinatesSizeAlloc);
}

void pme_gpu_realloc_and_copy_input_coefficients(const pme_gpu_t *pmeGPU, const float *h_coefficients)
{
    GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
    const size_t newCoefficientsSize = pmeGPU->nAtomsAlloc;
    GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coefficients, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc,
                        newCoefficientsSize, pmeGPU->archSpecific->pmeStream, true);
    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
                      pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->archSpecific->pmeStream);
    if (c_usePadding)
    {
        const size_t paddingIndex = pmeGPU->kernelParams->atoms.nAtoms;
        const size_t paddingCount = pmeGPU->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->atoms.d_coefficients + paddingIndex, 0, paddingCount * sizeof(float), pmeGPU->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded charges");
        }
    }
}

void pme_gpu_free_coefficients(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_coefficients, &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc);
}

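/* Spline parameter buffers: theta and dtheta, each DIM * order floats per atom,
 * with the atom count rounded up to the spline data alignment. The two device
 * arrays are always the same size, so they share one size/capacity counter.
 */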
void pme_gpu_realloc_spline_data(const pme_gpu_t *pmeGPU)
{
    const int    order             = pmeGPU->common->pme_order;
    const int    alignment         = pme_gpu_get_atom_spline_data_alignment(pmeGPU);
    const size_t nAtomsPadded      = ((pmeGPU->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const int    newSplineDataSize = DIM * order * nAtomsPadded;
    GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
    /* Two arrays of the same size */
    const bool shouldRealloc        = (newSplineDataSize > pmeGPU->archSpecific->splineDataSize);
    int        currentSizeTemp      = pmeGPU->archSpecific->splineDataSize;
    int        currentSizeTempAlloc = pmeGPU->archSpecific->splineDataSizeAlloc;
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_theta, nullptr, sizeof(float),
                        &currentSizeTemp, &currentSizeTempAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true);
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_dtheta, nullptr, sizeof(float),
                        &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc, newSplineDataSize, pmeGPU->archSpecific->pmeStream, true);
    // the host side reallocation
    if (shouldRealloc)
    {
        pfree(pmeGPU->staging.h_theta);
        pmalloc((void **)&pmeGPU->staging.h_theta, newSplineDataSize * sizeof(float));
        pfree(pmeGPU->staging.h_dtheta);
        pmalloc((void **)&pmeGPU->staging.h_dtheta, newSplineDataSize * sizeof(float));
    }
}

void pme_gpu_free_spline_data(const pme_gpu_t *pmeGPU)
{
    /* Two arrays of the same size */
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_theta);
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_dtheta, &pmeGPU->archSpecific->splineDataSize, &pmeGPU->archSpecific->splineDataSizeAlloc);
    pfree(pmeGPU->staging.h_theta);
    pfree(pmeGPU->staging.h_dtheta);
}

void pme_gpu_realloc_grid_indices(const pme_gpu_t *pmeGPU)
{
    const size_t newIndicesSize = DIM * pmeGPU->nAtomsAlloc;
    GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
    cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_gridlineIndices, nullptr, sizeof(int),
                        &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc, newIndicesSize, pmeGPU->archSpecific->pmeStream, true);
    pfree(pmeGPU->staging.h_gridlineIndices);
    pmalloc((void **)&pmeGPU->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
}

void pme_gpu_free_grid_indices(const pme_gpu_t *pmeGPU)
{
    cu_free_buffered(pmeGPU->kernelParams->atoms.d_gridlineIndices, &pmeGPU->archSpecific->gridlineIndicesSize, &pmeGPU->archSpecific->gridlineIndicesSizeAlloc);
    pfree(pmeGPU->staging.h_gridlineIndices);
}

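/* Real- and complex-space grids. With out-of-place FFT these are two separate
 * device buffers; otherwise a single buffer large enough for either is used and
 * d_fourierGrid simply aliases d_realGrid. The complex grid size is counted in
 * floats (hence the factor of 2), matching the float-typed allocations.
 */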
void pme_gpu_realloc_grids(pme_gpu_t *pmeGPU)
{
    auto     *kernelParamsPtr = pmeGPU->kernelParams.get();
    const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] *
        kernelParamsPtr->grid.realGridSizePadded[YY] *
        kernelParamsPtr->grid.realGridSizePadded[ZZ];
    const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] *
        kernelParamsPtr->grid.complexGridSizePadded[YY] *
        kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
    // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float
    if (pmeGPU->archSpecific->performOutOfPlaceFFT)
    {
        /* 2 separate grids */
        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fourierGrid, nullptr, sizeof(float),
                            &pmeGPU->archSpecific->complexGridSize, &pmeGPU->archSpecific->complexGridSizeAlloc,
                            newComplexGridSize, pmeGPU->archSpecific->pmeStream, true);
        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
                            &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc,
                            newRealGridSize, pmeGPU->archSpecific->pmeStream, true);
    }
    else
    {
        /* A single buffer so that any grid will fit */
        const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
        cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_realGrid, nullptr, sizeof(float),
                            &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc,
                            newGridsSize, pmeGPU->archSpecific->pmeStream, true);
        kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
        pmeGPU->archSpecific->complexGridSize = pmeGPU->archSpecific->realGridSize;
        // the size might get used later for copying the grid
    }
}

void pme_gpu_free_grids(const pme_gpu_t *pmeGPU)
{
    if (pmeGPU->archSpecific->performOutOfPlaceFFT)
    {
        cu_free_buffered(pmeGPU->kernelParams->grid.d_fourierGrid);
    }
    cu_free_buffered(pmeGPU->kernelParams->grid.d_realGrid,
                     &pmeGPU->archSpecific->realGridSize, &pmeGPU->archSpecific->realGridSizeAlloc);
}

void pme_gpu_clear_grids(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaMemsetAsync(pmeGPU->kernelParams->grid.d_realGrid, 0,
                                       pmeGPU->archSpecific->realGridSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
    /* Should the complex grid be cleared in some weird case? */
    CU_RET_ERR(stat, "cudaMemsetAsync on the PME grid error");
}

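/* Fractional shift and gridline index lookup tables: for each dimension, the
 * host-side fsh/nn arrays are packed back to back (c_pmeNeighborUnitcellCount
 * entries per grid line), copied to the device, and then bound to textures.
 */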
void pme_gpu_realloc_and_copy_fract_shifts(pme_gpu_t *pmeGPU)
{
    cudaStream_t stream          = pmeGPU->archSpecific->pmeStream;
    auto        *kernelParamsPtr = pmeGPU->kernelParams.get();

    const int    nx                  = kernelParamsPtr->grid.realGridSize[XX];
    const int    ny                  = kernelParamsPtr->grid.realGridSize[YY];
    const int    nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
    const int    cellCount           = c_pmeNeighborUnitcellCount;
    const int    gridDataOffset[DIM] = {0, cellCount * nx, cellCount * (nx + ny)};

    memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));

    const int    newFractShiftsSize  = cellCount * (nx + ny + nz);

    /* Two arrays, same size */
    int currentSizeTemp      = pmeGPU->archSpecific->fractShiftsSize;
    int currentSizeTempAlloc = pmeGPU->archSpecific->fractShiftsSizeAlloc;
    cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_fractShiftsTable, nullptr, sizeof(float),
                        &currentSizeTemp, &currentSizeTempAlloc,
                        newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true);
    float *fractShiftsArray = kernelParamsPtr->grid.d_fractShiftsTable;
    cu_realloc_buffered((void **)&kernelParamsPtr->grid.d_gridlineIndicesTable, nullptr, sizeof(int),
                        &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc,
                        newFractShiftsSize, pmeGPU->archSpecific->pmeStream, true);
    int *gridlineIndicesArray = kernelParamsPtr->grid.d_gridlineIndicesTable;

    /* TODO: pinning of the pmeGPU->common->fsh/nn host-side arrays */

    for (int i = 0; i < DIM; i++)
    {
        kernelParamsPtr->grid.tablesOffsets[i] = gridDataOffset[i];
        cu_copy_H2D_async(fractShiftsArray + gridDataOffset[i], pmeGPU->common->fsh[i].data(),
                          cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(float), stream);
        cu_copy_H2D_async(gridlineIndicesArray + gridDataOffset[i], pmeGPU->common->nn[i].data(),
                          cellCount * kernelParamsPtr->grid.realGridSize[i] * sizeof(int), stream);
    }

    pme_gpu_make_fract_shifts_textures(pmeGPU);
}

void pme_gpu_free_fract_shifts(const pme_gpu_t *pmeGPU)
{
    pme_gpu_free_fract_shifts_textures(pmeGPU);
    /* Two arrays, same size */
    cu_free_buffered(pmeGPU->kernelParams->grid.d_fractShiftsTable);
    cu_free_buffered(pmeGPU->kernelParams->grid.d_gridlineIndicesTable, &pmeGPU->archSpecific->fractShiftsSize, &pmeGPU->archSpecific->fractShiftsSizeAlloc);
}

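/* Waits (on the PME stream) for the energy/virial copy-back event, then
 * sanity-checks the staged host values for NaN/Inf (debug builds only, via GMX_ASSERT).
 */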
void pme_gpu_sync_output_energy_virial(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncEnerVirD2H, 0);
    CU_RET_ERR(stat, "Error while waiting for PME solve output");

    for (int j = 0; j < c_virialAndEnergyCount; j++)
    {
        GMX_ASSERT(std::isfinite(pmeGPU->staging.h_virialAndEnergy[j]), "PME GPU produces incorrect energy/virial.");
    }
}

void pme_gpu_copy_input_gather_grid(const pme_gpu_t *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_H2D_async(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_copy_output_spread_grid(const pme_gpu_t *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_D2H_async(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->archSpecific->pmeStream);
    cudaError_t  stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME spread grid sync event record failure");
}

void pme_gpu_sync_spread_grid(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSpreadGridD2H, 0);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
}

void pme_gpu_sync_spline_atom_data(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSplineAtomDataD2H, 0);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU atom data to be copied to the host");
}

void pme_gpu_sync_solve_grid(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat = cudaStreamWaitEvent(pmeGPU->archSpecific->pmeStream, pmeGPU->archSpecific->syncSolveGridD2H, 0);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU solve grid to be copied to the host");
    // should check for pme_gpu_performs_solve(pmeGPU)
}

void pme_gpu_init_internal(pme_gpu_t *pmeGPU)
{
    /* Allocate the target-specific structures */
    pmeGPU->archSpecific.reset(new pme_gpu_specific_t());
    pmeGPU->kernelParams.reset(new pme_gpu_kernel_params_t());

    pmeGPU->archSpecific->performOutOfPlaceFFT = true;
    /* This should give better performance, according to the cuFFT documentation.
     * The performance seems to be the same though.
     * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
     */

    pmeGPU->archSpecific->useTiming = (getenv("GMX_DISABLE_CUDA_TIMING") == nullptr) &&
        (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
    /* TODO: multiple CUDA streams on same GPU cause nonsense cudaEvent_t timings.
     * This should probably also check for gpuId exclusivity?
     */

    /* Creating a PME CUDA stream */
    cudaError_t stat;
    int         highest_priority, lowest_priority;
    stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
    CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
    stat = cudaStreamCreateWithPriority(&pmeGPU->archSpecific->pmeStream,
                                        cudaStreamDefault, //cudaStreamNonBlocking,
                                        highest_priority);
    CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
}

void pme_gpu_destroy_specific(const pme_gpu_t *pmeGPU)
{
    /* Destroy the CUDA stream */
    cudaError_t stat = cudaStreamDestroy(pmeGPU->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME cudaStreamDestroy error");
}

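/* The D2H synchronization events are created with timing disabled; they only
 * signal completion of the corresponding device-to-host copies.
 */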
void pme_gpu_init_sync_events(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat;
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, cudaEventDisableTiming);
    CU_RET_ERR(stat, "cudaEventCreate on syncEnerVirD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, cudaEventDisableTiming);
    CU_RET_ERR(stat, "cudaEventCreate on syncForcesD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, cudaEventDisableTiming);
    CU_RET_ERR(stat, "cudaEventCreate on syncSpreadGridD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, cudaEventDisableTiming);
    CU_RET_ERR(stat, "cudaEventCreate on syncSplineAtomDataD2H failed");
    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, cudaEventDisableTiming);
    CU_RET_ERR(stat, "cudaEventCreate on syncSolveGridD2H failed");
}

void pme_gpu_destroy_sync_events(const pme_gpu_t *pmeGPU)
{
    cudaError_t stat;
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncEnerVirD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncEnerVirD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncForcesD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncForcesD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSpreadGridD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSpreadGridD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSplineAtomDataD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSplineAtomDataD2H");
    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSolveGridD2H);
    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSolveGridD2H");
}

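/* (Re)creates one cuFFT-based 3D FFT setup per PME grid when the FFT stages run
 * on the GPU; destroying them is just clearing the vector of setups.
 */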
void pme_gpu_reinit_3dfft(const pme_gpu_t *pmeGPU)
{
    if (pme_gpu_performs_FFT(pmeGPU))
    {
        pmeGPU->archSpecific->fftSetup.resize(0);
        for (int i = 0; i < pmeGPU->common->ngrids; i++)
        {
            pmeGPU->archSpecific->fftSetup.push_back(std::unique_ptr<GpuParallel3dFft>(new GpuParallel3dFft(pmeGPU)));
        }
    }
}

void pme_gpu_destroy_3dfft(const pme_gpu_t *pmeGPU)
{
    pmeGPU->archSpecific->fftSetup.resize(0);
}