/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief This file contains internal CUDA function implementations
 * for performing the PME calculations on GPU.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */
#include "gmxpre.h"

#include <cstdlib>
#include <cstring>

#include <algorithm>
#include <memory>

#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"

#include "pme.cuh"
#include "pme-3dfft.cuh"
#include "pme-grid.h"
int pme_gpu_get_atom_data_alignment(const PmeGpu *pmeGpu)
{
    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order > 0, "Invalid PME order");
    return PME_ATOM_DATA_ALIGNMENT;
}
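
/*! \brief Returns the number of atoms processed together by a single warp
 * in the spread/gather kernels.
 */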
int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
{
    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order > 0, "Invalid PME order");
    return PME_SPREADGATHER_ATOMS_PER_WARP;
}
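
/*! \brief Blocks until all previously submitted work in the PME GPU stream has completed. */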
void pme_gpu_synchronize(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaStreamSynchronize(pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!");
}
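
/*! \brief Allocates the device buffer and the pinned host staging buffer
 * for the reduced energy and virial.
 */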
void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu)
{
    const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
    cudaError_t  stat                = cudaMalloc((void **)&pmeGpu->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize);
    CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial");
    pmalloc((void **)&pmeGpu->staging.h_virialAndEnergy, energyAndVirialSize);
}
void pme_gpu_free_energy_virial(PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaFree(pmeGpu->kernelParams->constants.d_virialAndEnergy);
    CU_RET_ERR(stat, "cudaFree failed on PME energy and virial");
    pmeGpu->kernelParams->constants.d_virialAndEnergy = nullptr;
    pfree(pmeGpu->staging.h_virialAndEnergy);
    pmeGpu->staging.h_virialAndEnergy = nullptr;
}
void pme_gpu_clear_energy_virial(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->constants.d_virialAndEnergy, 0,
                                       c_virialAndEnergyCount * sizeof(float), pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME energy/virial cudaMemsetAsync error");
}
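
/*! \brief (Re-)allocates the device buffer for the B-spline moduli and copies them over.
 *
 * The moduli of all three dimensions are packed back to back into a single buffer of
 * nx + ny + nz floats; splineValuesOffset records where each dimension's data starts.
 */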
void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu)
{
    const int splineValuesOffset[DIM] = {
        0,
        pmeGpu->kernelParams->grid.realGridSize[XX],
        pmeGpu->kernelParams->grid.realGridSize[XX] + pmeGpu->kernelParams->grid.realGridSize[YY]
    };
    memcpy((void *)&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));

    const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX] +
        pmeGpu->kernelParams->grid.realGridSize[YY] +
        pmeGpu->kernelParams->grid.realGridSize[ZZ];
    const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
                           &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    if (shouldRealloc)
    {
        /* Reallocate the host buffer */
        pfree(pmeGpu->staging.h_splineModuli);
        pmalloc((void **)&pmeGpu->staging.h_splineModuli, newSplineValuesSize * sizeof(float));
    }
    for (int i = 0; i < DIM; i++)
    {
        memcpy(pmeGpu->staging.h_splineModuli + splineValuesOffset[i], pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
    }
    /* TODO: pin original buffer instead! */
    cu_copy_H2D(pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli,
                newSplineValuesSize * sizeof(float), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_free_bspline_values(const PmeGpu *pmeGpu)
{
    pfree(pmeGpu->staging.h_splineModuli);
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli);
}
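
/*! \brief (Re-)allocates the device buffer and resizes the host staging buffer
 * for the PME output forces.
 */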
void pme_gpu_realloc_forces(PmeGpu *pmeGpu)
{
    const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
    GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    pmeGpu->staging.h_forces.reserve(pmeGpu->nAtomsAlloc);
    pmeGpu->staging.h_forces.resize(pmeGpu->kernelParams->atoms.nAtoms);
}
void pme_gpu_free_forces(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces);
}
void pme_gpu_copy_input_forces(PmeGpu *pmeGpu)
{
    const size_t forcesSize = DIM * pmeGpu->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_H2D(pmeGpu->kernelParams->atoms.d_forces, pmeGpu->staging.h_forces.data(), forcesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_output_forces(PmeGpu *pmeGpu)
{
    const size_t forcesSize = DIM * pmeGpu->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_D2H(pmeGpu->staging.h_forces.data(), pmeGpu->kernelParams->atoms.d_forces, forcesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu)
{
    const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
    GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
                           &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    if (c_usePadding)
    {
        const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
        const size_t paddingCount = DIM * pmeGpu->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->atoms.d_coordinates + paddingIndex, 0, paddingCount * sizeof(float), pmeGpu->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded coordinates");
        }
    }
}
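
/*! \brief Copies the input coordinates onto the device.
 *
 * Only single precision is supported; in that case rvec is a triplet of floats,
 * so the host data can be transferred as a flat float array.
 */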
void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordinates)
{
    GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU");
#if GMX_DOUBLE
    GMX_RELEASE_ASSERT(false, "Only single precision is supported");
    GMX_UNUSED_VALUE(h_coordinates);
#else
    cu_copy_H2D(pmeGpu->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
                pmeGpu->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
#endif
}
void pme_gpu_free_coordinates(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates);
}
void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const float *h_coefficients)
{
    GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
    const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
                           &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D(pmeGpu->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
                pmeGpu->kernelParams->atoms.nAtoms * sizeof(float), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    if (c_usePadding)
    {
        const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
        const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->atoms.d_coefficients + paddingIndex, 0, paddingCount * sizeof(float), pmeGpu->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded charges");
        }
    }
}
void pme_gpu_free_coefficients(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients);
}
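
/*! \brief (Re-)allocates the device and host buffers for the atom spline parameters
 * (theta and their derivatives dtheta).
 *
 * The spline data is laid out warp by warp, so the atom count is first padded up to
 * a multiple of the atoms-per-warp alignment (e.g. with an alignment of 4, 21 allocated
 * atoms pad up to 24), giving DIM * order * nAtomsPadded floats per array.
 */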
void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu)
{
    const int    order             = pmeGpu->common->pme_order;
    const int    alignment         = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded      = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const int    newSplineDataSize = DIM * order * nAtomsPadded;
    GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
    /* Two arrays of the same size */
    const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
    int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
    int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->pmeStream);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->pmeStream);
    // the host side reallocation
    if (shouldRealloc)
    {
        pfree(pmeGpu->staging.h_theta);
        pmalloc((void **)&pmeGpu->staging.h_theta, newSplineDataSize * sizeof(float));
        pfree(pmeGpu->staging.h_dtheta);
        pmalloc((void **)&pmeGpu->staging.h_dtheta, newSplineDataSize * sizeof(float));
    }
}
void pme_gpu_free_spline_data(const PmeGpu *pmeGpu)
{
    /* Two arrays of the same size */
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta);
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta);
    pfree(pmeGpu->staging.h_theta);
    pfree(pmeGpu->staging.h_dtheta);
}
void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu)
{
    const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
                           &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    pfree(pmeGpu->staging.h_gridlineIndices);
    pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
}
void pme_gpu_free_grid_indices(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices);
    pfree(pmeGpu->staging.h_gridlineIndices);
}
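
/*! \brief (Re-)allocates the real- and complex-space grids on the device.
 *
 * With an out-of-place FFT, the real and complex grids are separate allocations;
 * otherwise a single buffer large enough for either representation is shared by both.
 */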
void pme_gpu_realloc_grids(PmeGpu *pmeGpu)
{
    auto     *kernelParamsPtr = pmeGpu->kernelParams.get();
    const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] *
        kernelParamsPtr->grid.realGridSizePadded[YY] *
        kernelParamsPtr->grid.realGridSizePadded[ZZ];
    const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] *
        kernelParamsPtr->grid.complexGridSizePadded[YY] *
        kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
    // Multiplied by 2 because we count the complex grid size in complex numbers, while all allocations/pointers are float
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        /* 2 separate grids */
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
                               &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
    }
    else
    {
        /* A single buffer so that any grid will fit */
        const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newGridsSize,
                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
        kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
        pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
        // the size might get used later for copying the grid
    }
}
void pme_gpu_free_grids(const PmeGpu *pmeGpu)
{
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid);
    }
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid);
}
void pme_gpu_clear_grids(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->grid.d_realGrid, 0,
                                       pmeGpu->archSpecific->realGridSize * sizeof(float), pmeGpu->archSpecific->pmeStream);
    /* Should the complex grid be cleared in some weird case? */
    CU_RET_ERR(stat, "cudaMemsetAsync on the PME grid error");
}
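
/*! \brief (Re-)allocates and fills the lookup tables of fractional shifts and
 * gridline indices used by the spreading kernel.
 *
 * Both tables store cellCount entries per grid point in each dimension, packed
 * dimension by dimension into single buffers, with tablesOffsets marking the starts.
 */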
void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *pmeGpu)
{
    pme_gpu_free_fract_shifts(pmeGpu);

    auto *kernelParamsPtr = pmeGpu->kernelParams.get();

    const int nx                  = kernelParamsPtr->grid.realGridSize[XX];
    const int ny                  = kernelParamsPtr->grid.realGridSize[YY];
    const int nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
    const int cellCount           = c_pmeNeighborUnitcellCount;
    const int gridDataOffset[DIM] = {0, cellCount * nx, cellCount * (nx + ny)};

    memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));

    const int newFractShiftsSize = cellCount * (nx + ny + nz);

    initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                         kernelParamsPtr->fractShiftsTableTexture,
                         pmeGpu->common->fsh.data(),
                         newFractShiftsSize,
                         pmeGpu->deviceInfo);

    initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                         kernelParamsPtr->gridlineIndicesTableTexture,
                         pmeGpu->common->nn.data(),
                         newFractShiftsSize,
                         pmeGpu->deviceInfo);
}
void pme_gpu_free_fract_shifts(const PmeGpu *pmeGpu)
{
    auto *kernelParamsPtr = pmeGpu->kernelParams.get();
    destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                            kernelParamsPtr->fractShiftsTableTexture,
                            pmeGpu->deviceInfo);
    destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                            kernelParamsPtr->gridlineIndicesTableTexture,
                            pmeGpu->deviceInfo);
}
bool pme_gpu_stream_query(const PmeGpu *pmeGpu)
{
    return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_H2D(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_output_spread_grid(const PmeGpu *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_D2H(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME spread grid sync event record failure");
}
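
/*! \brief Copies the spread output (spline values/derivatives and gridline indices)
 * from the device to the host staging buffers.
 */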
void pme_gpu_copy_output_spread_atom_data(const PmeGpu *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
    cu_copy_D2H(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_D2H(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_D2H(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
                kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
    if (c_usePadding)
    {
        const size_t gridlineIndicesSizePerAtom = DIM * sizeof(int);
        const size_t splineDataSizePerAtom      = pmeGpu->common->pme_order * DIM * sizeof(float);
        // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * gridlineIndicesSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the gridline indices");
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_dtheta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the spline derivatives");
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_theta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the spline values");
    }
    cu_copy_H2D(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
                kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_sync_spread_grid(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaEventSynchronize(pmeGpu->archSpecific->syncSpreadGridD2H);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
}
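
/*! \brief Performs the one-time CUDA-specific PME GPU initialization:
 * allocates the architecture-specific structures and creates the PME stream.
 */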
void pme_gpu_init_internal(PmeGpu *pmeGpu)
{
    /* Allocate the target-specific structures */
    pmeGpu->archSpecific.reset(new PmeGpuSpecific());
    pmeGpu->kernelParams.reset(new PmeGpuKernelParams());

    pmeGpu->archSpecific->performOutOfPlaceFFT = true;
    /* This should give better performance, according to the cuFFT documentation.
     * The performance seems to be the same though.
     * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
     */

    /* WARNING: CUDA timings are incorrect with multiple streams.
     * This is the main reason why they are disabled by default.
     */
    // TODO: Consider turning on by default when we can detect the number of streams.
    pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);

    pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];

    /* Creating a PME CUDA stream */
    cudaError_t stat;
    int         highest_priority, lowest_priority;
    stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
    CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
    stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream,
                                        cudaStreamDefault, //cudaStreamNonBlocking,
                                        highest_priority);
    CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
}
void pme_gpu_destroy_specific(const PmeGpu *pmeGpu)
{
    /* Destroy the CUDA stream */
    cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME cudaStreamDestroy error");
}
void pme_gpu_init_sync_events(const PmeGpu *pmeGpu)
{
    const auto eventFlags = cudaEventDisableTiming;
    CU_RET_ERR(cudaEventCreateWithFlags(&pmeGpu->archSpecific->syncSpreadGridD2H, eventFlags), "cudaEventCreate on syncSpreadGridD2H failed");
}
void pme_gpu_destroy_sync_events(const PmeGpu *pmeGpu)
{
    CU_RET_ERR(cudaEventDestroy(pmeGpu->archSpecific->syncSpreadGridD2H), "cudaEventDestroy failed on syncSpreadGridD2H");
}
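
/*! \brief (Re-)initializes the GPU 3D-FFT setup, one plan wrapper per grid. */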
void pme_gpu_reinit_3dfft(const PmeGpu *pmeGpu)
{
    if (pme_gpu_performs_FFT(pmeGpu))
    {
        pmeGpu->archSpecific->fftSetup.resize(0);
        for (int i = 0; i < pmeGpu->common->ngrids; i++)
        {
            pmeGpu->archSpecific->fftSetup.push_back(std::unique_ptr<GpuParallel3dFft>(new GpuParallel3dFft(pmeGpu)));
        }
    }
}
void pme_gpu_destroy_3dfft(const PmeGpu *pmeGpu)
{
    pmeGpu->archSpecific->fftSetup.resize(0);
}
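
/*! \brief Recovers the flat spline-parameter index from its per-warp decomposition,
 * for accessing the theta/dtheta arrays on the host (e.g. in tests).
 *
 * Only the hard-coded PME order of 4 is supported.
 */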
int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int warpIndex, int atomWarpIndex)
{
    if (order != 4)
    {
        throw order;
    }
    constexpr int fixedOrder = 4;
    GMX_UNUSED_VALUE(fixedOrder);
    const int indexBase = getSplineParamIndexBase<fixedOrder>(warpIndex, atomWarpIndex);
    return getSplineParamIndex<fixedOrder>(indexBase, dimIndex, splineIndex);
}