Removed cu_realloc_buffered() in favor of reallocateDeviceBuffer()
src/gromacs/ewald/pme.cu
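As a rough, self-contained sketch (placeholder names, not part of the file below) of the buffered-reallocation call pattern this change standardizes on throughout pme.cu:

    // Hypothetical device buffer with the size/capacity counters that
    // reallocateDeviceBuffer() maintains (names are placeholders).
    float       *d_values        = nullptr;
    int          valuesSize      = 0;  // element count currently in use
    int          valuesSizeAlloc = 0;  // element count currently allocated
    cudaStream_t stream          = nullptr; // stands in for the PME stream used in this file
    const int    newSize         = 3 * 1000; // e.g. DIM floats per atom, as for d_forces

    // Reallocates the device memory when the requested size exceeds the tracked
    // capacity, replacing the old cu_realloc_buffered() helper.
    reallocateDeviceBuffer(&d_values, newSize, &valuesSize, &valuesSizeAlloc, stream);

    // Matching cleanup, as used by the pme_gpu_free_*() functions below.
    freeDeviceBuffer(&d_values);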
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief This file contains internal CUDA function implementations
 * for performing the PME calculations on GPU.
 *
 * \author Aleksei Iupinov <a.yupinov@gmail.com>
 */

#include "gmxpre.h"

#include <cmath>

/* The rest */
#include "pme.h"

#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"

#include "pme.cuh"
#include "pme-3dfft.cuh"
#include "pme-grid.h"

int pme_gpu_get_atom_data_alignment(const PmeGpu *pmeGpu)
{
    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order > 0, "Invalid PME order");
    return PME_ATOM_DATA_ALIGNMENT;
}

int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
{
    const int order = pmeGpu->common->pme_order;
    GMX_ASSERT(order > 0, "Invalid PME order");
    return PME_SPREADGATHER_ATOMS_PER_WARP;
}

void pme_gpu_synchronize(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaStreamSynchronize(pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!");
}

void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu)
{
    const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
    cudaError_t  stat                = cudaMalloc((void **)&pmeGpu->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize);
    CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial");
    pmalloc((void **)&pmeGpu->staging.h_virialAndEnergy, energyAndVirialSize);
}

void pme_gpu_free_energy_virial(PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaFree(pmeGpu->kernelParams->constants.d_virialAndEnergy);
    CU_RET_ERR(stat, "cudaFree failed on PME energy and virial");
    pmeGpu->kernelParams->constants.d_virialAndEnergy = nullptr;
    pfree(pmeGpu->staging.h_virialAndEnergy);
    pmeGpu->staging.h_virialAndEnergy = nullptr;
}

void pme_gpu_clear_energy_virial(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->constants.d_virialAndEnergy, 0,
                                       c_virialAndEnergyCount * sizeof(float), pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME energy/virial cudaMemsetAsync error");
}

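/* The B-spline moduli for the three dimensions are stored back to back in a single device
 * buffer; grid.splineValuesOffset records where each dimension starts (0, nx, nx + ny),
 * matching the memcpy loop below.
 */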
void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu)
{
    const int splineValuesOffset[DIM] = {
        0,
        pmeGpu->kernelParams->grid.realGridSize[XX],
        pmeGpu->kernelParams->grid.realGridSize[XX] + pmeGpu->kernelParams->grid.realGridSize[YY]
    };
    memcpy((void *)&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));

    const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX] +
        pmeGpu->kernelParams->grid.realGridSize[YY] +
        pmeGpu->kernelParams->grid.realGridSize[ZZ];
    const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
                           &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    if (shouldRealloc)
    {
        /* Reallocate the host buffer */
        pfree(pmeGpu->staging.h_splineModuli);
        pmalloc((void **)&pmeGpu->staging.h_splineModuli, newSplineValuesSize * sizeof(float));
    }
    for (int i = 0; i < DIM; i++)
    {
        memcpy(pmeGpu->staging.h_splineModuli + splineValuesOffset[i], pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
    }
    /* TODO: pin original buffer instead! */
    cu_copy_H2D(pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli,
                newSplineValuesSize * sizeof(float), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_free_bspline_values(const PmeGpu *pmeGpu)
{
    pfree(pmeGpu->staging.h_splineModuli);
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli);
}

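/* Forces occupy DIM floats per atom; the host staging vector keeps capacity for the padded
 * allocation while its size tracks the current atom count.
 */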
void pme_gpu_realloc_forces(PmeGpu *pmeGpu)
{
    const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
    GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    pmeGpu->staging.h_forces.reserve(pmeGpu->nAtomsAlloc);
    pmeGpu->staging.h_forces.resize(pmeGpu->kernelParams->atoms.nAtoms);
}

void pme_gpu_free_forces(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces);
}

void pme_gpu_copy_input_forces(PmeGpu *pmeGpu)
{
    const size_t forcesSize = DIM * pmeGpu->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_H2D(pmeGpu->kernelParams->atoms.d_forces, pmeGpu->staging.h_forces.data(), forcesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_copy_output_forces(PmeGpu *pmeGpu)
{
    const size_t forcesSize = DIM * pmeGpu->kernelParams->atoms.nAtoms * sizeof(float);
    GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
    cu_copy_D2H(pmeGpu->staging.h_forces.data(), pmeGpu->kernelParams->atoms.d_forces, forcesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

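/* Coordinates occupy DIM floats per atom. When padding is enabled, the region between the
 * last real atom and the end of the padded allocation is zeroed, so that kernels working on
 * whole warps read defined values for the dummy atoms.
 */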
void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu)
{
    const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
    GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
                           &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    if (c_usePadding)
    {
        const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
        const size_t paddingCount = DIM * pmeGpu->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->atoms.d_coordinates + paddingIndex, 0, paddingCount * sizeof(float), pmeGpu->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded coordinates");
        }
    }
}

void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordinates)
{
    GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU");
#if GMX_DOUBLE
    GMX_RELEASE_ASSERT(false, "Only single precision is supported");
    GMX_UNUSED_VALUE(h_coordinates);
#else
    cu_copy_H2D(pmeGpu->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
                pmeGpu->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
#endif
}

void pme_gpu_free_coordinates(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates);
}

void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const float *h_coefficients)
{
    GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
    const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
                           &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D(pmeGpu->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
                pmeGpu->kernelParams->atoms.nAtoms * sizeof(float), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    if (c_usePadding)
    {
        const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
        const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
        if (paddingCount > 0)
        {
            cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->atoms.d_coefficients + paddingIndex, 0, paddingCount * sizeof(float), pmeGpu->archSpecific->pmeStream);
            CU_RET_ERR(stat, "PME failed to clear the padded charges");
        }
    }
}

void pme_gpu_free_coefficients(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients);
}

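/* d_theta and d_dtheta always have identical sizes, so a single pair of size counters is
 * kept in archSpecific. The first reallocateDeviceBuffer() call below works on throwaway
 * copies of those counters, so that the second call still sees the old size and both
 * buffers get reallocated consistently.
 */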
void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu)
{
    const int    order             = pmeGpu->common->pme_order;
    const int    alignment         = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded      = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const int    newSplineDataSize = DIM * order * nAtomsPadded;
    GMX_ASSERT(newSplineDataSize > 0, "Bad number of atoms in PME GPU");
    /* Two arrays of the same size */
    const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
    int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
    int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->pmeStream);
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->pmeStream);
    // the host side reallocation
    if (shouldRealloc)
    {
        pfree(pmeGpu->staging.h_theta);
        pmalloc((void **)&pmeGpu->staging.h_theta, newSplineDataSize * sizeof(float));
        pfree(pmeGpu->staging.h_dtheta);
        pmalloc((void **)&pmeGpu->staging.h_dtheta, newSplineDataSize * sizeof(float));
    }
}

void pme_gpu_free_spline_data(const PmeGpu *pmeGpu)
{
    /* Two arrays of the same size */
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta);
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta);
    pfree(pmeGpu->staging.h_theta);
    pfree(pmeGpu->staging.h_dtheta);
}

void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu)
{
    const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
    GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
                           &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->pmeStream);
    pfree(pmeGpu->staging.h_gridlineIndices);
    pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
}

void pme_gpu_free_grid_indices(const PmeGpu *pmeGpu)
{
    freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices);
    pfree(pmeGpu->staging.h_gridlineIndices);
}

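/* With out-of-place FFT the real and complex grids are separate allocations; otherwise a
 * single buffer large enough for either representation is used and d_fourierGrid aliases
 * d_realGrid.
 */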
void pme_gpu_realloc_grids(PmeGpu *pmeGpu)
{
    auto     *kernelParamsPtr = pmeGpu->kernelParams.get();
    const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] *
        kernelParamsPtr->grid.realGridSizePadded[YY] *
        kernelParamsPtr->grid.realGridSizePadded[ZZ];
    const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] *
        kernelParamsPtr->grid.complexGridSizePadded[YY] *
        kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
    // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        /* 2 separate grids */
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
                               &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
    }
    else
    {
        /* A single buffer so that any grid will fit */
        const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newGridsSize,
                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
        kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
        pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
        // the size might get used later for copying the grid
    }
}

void pme_gpu_free_grids(const PmeGpu *pmeGpu)
{
    if (pmeGpu->archSpecific->performOutOfPlaceFFT)
    {
        freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid);
    }
    freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid);
}

void pme_gpu_clear_grids(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaMemsetAsync(pmeGpu->kernelParams->grid.d_realGrid, 0,
                                       pmeGpu->archSpecific->realGridSize * sizeof(float), pmeGpu->archSpecific->pmeStream);
    /* Should the complex grid be cleared in some weird case? */
    CU_RET_ERR(stat, "cudaMemsetAsync on the PME grid error");
}

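/* The fractional shift and gridline-index tables for the three dimensions are packed into
 * single arrays of cellCount * (nx + ny + nz) entries; grid.tablesOffsets records where each
 * dimension starts. Both tables are uploaded as texture-backed lookup tables.
 */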
void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *pmeGpu)
{
    pme_gpu_free_fract_shifts(pmeGpu);

    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();

    const int    nx                  = kernelParamsPtr->grid.realGridSize[XX];
    const int    ny                  = kernelParamsPtr->grid.realGridSize[YY];
    const int    nz                  = kernelParamsPtr->grid.realGridSize[ZZ];
    const int    cellCount           = c_pmeNeighborUnitcellCount;
    const int    gridDataOffset[DIM] = {0, cellCount * nx, cellCount * (nx + ny)};

    memcpy(kernelParamsPtr->grid.tablesOffsets, &gridDataOffset, sizeof(gridDataOffset));

    const int    newFractShiftsSize  = cellCount * (nx + ny + nz);

    initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                         kernelParamsPtr->fractShiftsTableTexture,
                         pmeGpu->common->fsh.data(),
                         newFractShiftsSize,
                         pmeGpu->deviceInfo);

    initParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                         kernelParamsPtr->gridlineIndicesTableTexture,
                         pmeGpu->common->nn.data(),
                         newFractShiftsSize,
                         pmeGpu->deviceInfo);
}

void pme_gpu_free_fract_shifts(const PmeGpu *pmeGpu)
{
    auto *kernelParamsPtr = pmeGpu->kernelParams.get();
    destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
                            kernelParamsPtr->fractShiftsTableTexture,
                            pmeGpu->deviceInfo);
    destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
                            kernelParamsPtr->gridlineIndicesTableTexture,
                            pmeGpu->deviceInfo);
}

bool pme_gpu_stream_query(const PmeGpu *pmeGpu)
{
    return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_H2D(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_copy_output_spread_grid(const PmeGpu *pmeGpu, float *h_grid)
{
    const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
    cu_copy_D2H(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cudaError_t  stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME spread grid sync event record failure");
}

void pme_gpu_copy_output_spread_atom_data(const PmeGpu *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
    cu_copy_D2H(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_D2H(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_D2H(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
                kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

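/* Uploads spline parameters and gridline indices that were produced outside the GPU spread
 * kernel; per the TODO below this path is mainly exercised by tests. With padding enabled,
 * the whole padded region is cleared before the copies.
 */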
void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
{
    const int    alignment       = pme_gpu_get_atoms_per_warp(pmeGpu);
    const size_t nAtomsPadded    = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
    const size_t splinesSize     = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
    auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
    if (c_usePadding)
    {
        const size_t gridlineIndicesSizePerAtom = DIM * sizeof(int);
        const size_t splineDataSizePerAtom      = pmeGpu->common->pme_order * DIM * sizeof(float);
        // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * gridlineIndicesSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the gridline indices");
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_dtheta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the spline derivatives");
        CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_theta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                   "PME failed to clear the spline values");
    }
    cu_copy_H2D(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
    cu_copy_H2D(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
                kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}

void pme_gpu_sync_spread_grid(const PmeGpu *pmeGpu)
{
    cudaError_t stat = cudaEventSynchronize(pmeGpu->archSpecific->syncSpreadGridD2H);
    CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
}

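/* Sets up the CUDA-specific PME data: allocates the arch-specific and kernel-parameter
 * structures, decides on out-of-place FFT and timing, and creates the PME stream with the
 * highest available priority.
 */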
void pme_gpu_init_internal(PmeGpu *pmeGpu)
{
    /* Allocate the target-specific structures */
    pmeGpu->archSpecific.reset(new PmeGpuSpecific());
    pmeGpu->kernelParams.reset(new PmeGpuKernelParams());

    pmeGpu->archSpecific->performOutOfPlaceFFT = true;
    /* This should give better performance, according to the cuFFT documentation.
     * The performance seems to be the same though.
     * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
     */

    /* WARNING: CUDA timings are incorrect with multiple streams.
     *          This is the main reason why they are disabled by default.
     */
    // TODO: Consider turning on by default when we can detect nr of streams.
    pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);

    pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];

    /* Creating a PME CUDA stream */
    cudaError_t stat;
    int         highest_priority, lowest_priority;
    stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
    CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
    stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream,
                                        cudaStreamDefault, //cudaStreamNonBlocking,
                                        highest_priority);
    CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
}

void pme_gpu_destroy_specific(const PmeGpu *pmeGpu)
{
    /* Destroy the CUDA stream */
    cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream);
    CU_RET_ERR(stat, "PME cudaStreamDestroy error");
}

void pme_gpu_init_sync_events(const PmeGpu *pmeGpu)
{
    const auto  eventFlags = cudaEventDisableTiming;
    CU_RET_ERR(cudaEventCreateWithFlags(&pmeGpu->archSpecific->syncSpreadGridD2H, eventFlags), "cudaEventCreate on syncSpreadGridD2H failed");
}

void pme_gpu_destroy_sync_events(const PmeGpu *pmeGpu)
{
    CU_RET_ERR(cudaEventDestroy(pmeGpu->archSpecific->syncSpreadGridD2H), "cudaEventDestroy failed on syncSpreadGridD2H");
}

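/* Recreates the cuFFT-based 3D-FFT setup, one GpuParallel3dFft instance per grid, but only
 * when the FFT stage actually runs on the GPU.
 */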
void pme_gpu_reinit_3dfft(const PmeGpu *pmeGpu)
{
    if (pme_gpu_performs_FFT(pmeGpu))
    {
        pmeGpu->archSpecific->fftSetup.resize(0);
        for (int i = 0; i < pmeGpu->common->ngrids; i++)
        {
            pmeGpu->archSpecific->fftSetup.push_back(std::unique_ptr<GpuParallel3dFft>(new GpuParallel3dFft(pmeGpu)));
        }
    }
}

void pme_gpu_destroy_3dfft(const PmeGpu *pmeGpu)
{
    pmeGpu->archSpecific->fftSetup.resize(0);
}

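/* Maps a (dimension, spline index, warp index, atom-in-warp index) tuple to the flat index
 * used by the GPU spline-parameter layout. The index helpers are templated on a compile-time
 * order and only order 4 is instantiated here, hence the throw for any other order.
 */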
int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int warpIndex, int atomWarpIndex)
{
    if (order != 4)
    {
        throw order;
    }
    constexpr int fixedOrder = 4;
    GMX_UNUSED_VALUE(fixedOrder);
    const int     indexBase  = getSplineParamIndexBase<fixedOrder>(warpIndex, atomWarpIndex);
    return getSplineParamIndex<fixedOrder>(indexBase, dimIndex, splineIndex);
}