Apply re-formatting to C++ in src/ tree.

[alexxy/gromacs.git] / src / gromacs / ewald / pme_gpu_internal.cpp
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp

index c67e471ca3c86118b89d4061e046c8b824bebe1d..e78373ec8b9dc9547fc2e2bd13a25da6cb79a3f8 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -142,7 +142,8 @@ void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu)
      for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
      {
          allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex],
-                             c_virialAndEnergyCount, pmeGpu->archSpecific->deviceContext_);
+                             c_virialAndEnergyCount,
+                             pmeGpu->archSpecific->deviceContext_);
          pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_virialAndEnergy[gridIndex]), energyAndVirialSize);
      }
  }
@@ -161,8 +162,10 @@ void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu)
  {
      for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
      {
-        clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex], 0,
-                               c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_);
+        clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex],
+                               0,
+                               c_virialAndEnergyCount,
+                               pmeGpu->archSpecific->pmeStream_);
      }
  }
  
@@ -174,7 +177,8 @@ void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, const int gridIndex
      GMX_ASSERT(gridIndex < pmeGpu->common->ngrids,
                 "Invalid combination of gridIndex and number of grids");
  
-    const int splineValuesOffset[DIM] = { 0, pmeGpu->kernelParams->grid.realGridSize[XX],
+    const int splineValuesOffset[DIM] = { 0,
+                                          pmeGpu->kernelParams->grid.realGridSize[XX],
                                            pmeGpu->kernelParams->grid.realGridSize[XX]
                                                    + pmeGpu->kernelParams->grid.realGridSize[YY] };
      memcpy(&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));
@@ -184,7 +188,8 @@ void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, const int gridIndex
                                      + pmeGpu->kernelParams->grid.realGridSize[ZZ];
      const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize[gridIndex]);
      reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex],
-                           newSplineValuesSize, &pmeGpu->archSpecific->splineValuesSize[gridIndex],
+                           newSplineValuesSize,
+                           &pmeGpu->archSpecific->splineValuesSize[gridIndex],
                             &pmeGpu->archSpecific->splineValuesCapacity[gridIndex],
                             pmeGpu->archSpecific->deviceContext_);
      if (shouldRealloc)
@@ -197,12 +202,17 @@ void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, const int gridIndex
      for (int i = 0; i < DIM; i++)
      {
          memcpy(pmeGpu->staging.h_splineModuli[gridIndex] + splineValuesOffset[i],
-               pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
+               pmeGpu->common->bsp_mod[i].data(),
+               pmeGpu->common->bsp_mod[i].size() * sizeof(float));
      }
      /* TODO: pin original buffer instead! */
      copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex],
-                       pmeGpu->staging.h_splineModuli[gridIndex], 0, newSplineValuesSize,
-                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+                       pmeGpu->staging.h_splineModuli[gridIndex],
+                       0,
+                       newSplineValuesSize,
+                       pmeGpu->archSpecific->pmeStream_,
+                       pmeGpu->settings.transferKind,
+                       nullptr);
  }
  
  void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu)
@@ -218,8 +228,10 @@ void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
  {
      const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
      GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
-    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
-                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc,
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
+                           newForcesSize,
+                           &pmeGpu->archSpecific->forcesSize,
+                           &pmeGpu->archSpecific->forcesSizeAlloc,
                             pmeGpu->archSpecific->deviceContext_);
      pmeGpu->staging.h_forces.reserveWithPadding(pmeGpu->nAtomsAlloc);
      pmeGpu->staging.h_forces.resizeWithPadding(pmeGpu->kernelParams->atoms.nAtoms);
@@ -234,18 +246,26 @@ void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
  {
      GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
      float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
-    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat, 0,
-                       DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
-                       pmeGpu->settings.transferKind, nullptr);
+    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
+                       h_forcesFloat,
+                       0,
+                       DIM * pmeGpu->kernelParams->atoms.nAtoms,
+                       pmeGpu->archSpecific->pmeStream_,
+                       pmeGpu->settings.transferKind,
+                       nullptr);
  }
  
  void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
  {
      GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
      float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
-    copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0,
-                         DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
-                         pmeGpu->settings.transferKind, nullptr);
+    copyFromDeviceBuffer(h_forcesFloat,
+                         &pmeGpu->kernelParams->atoms.d_forces,
+                         0,
+                         DIM * pmeGpu->kernelParams->atoms.nAtoms,
+                         pmeGpu->archSpecific->pmeStream_,
+                         pmeGpu->settings.transferKind,
+                         nullptr);
  }
  
  void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu,
@@ -256,19 +276,26 @@ void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu,
      const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
      GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
      reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex],
-                           newCoefficientsSize, &pmeGpu->archSpecific->coefficientsSize[gridIndex],
+                           newCoefficientsSize,
+                           &pmeGpu->archSpecific->coefficientsSize[gridIndex],
                             &pmeGpu->archSpecific->coefficientsCapacity[gridIndex],
                             pmeGpu->archSpecific->deviceContext_);
      copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex],
-                       const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
-                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+                       const_cast<float*>(h_coefficients),
+                       0,
+                       pmeGpu->kernelParams->atoms.nAtoms,
+                       pmeGpu->archSpecific->pmeStream_,
+                       pmeGpu->settings.transferKind,
+                       nullptr);
  
      const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
      const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
      if (paddingCount > 0)
      {
-        clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex], paddingIndex,
-                               paddingCount, pmeGpu->archSpecific->pmeStream_);
+        clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex],
+                               paddingIndex,
+                               paddingCount,
+                               pmeGpu->archSpecific->pmeStream_);
      }
  }
  
@@ -289,10 +316,15 @@ void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
      const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
      int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
      int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
-    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize, &currentSizeTemp,
-                           &currentSizeTempAlloc, pmeGpu->archSpecific->deviceContext_);
-    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
-                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc,
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta,
+                           newSplineDataSize,
+                           &currentSizeTemp,
+                           &currentSizeTempAlloc,
+                           pmeGpu->archSpecific->deviceContext_);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta,
+                           newSplineDataSize,
+                           &pmeGpu->archSpecific->splineDataSize,
+                           &pmeGpu->archSpecific->splineDataSizeAlloc,
                             pmeGpu->archSpecific->deviceContext_);
      // the host side reallocation
      if (shouldRealloc)
@@ -317,7 +349,8 @@ void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu)
  {
      const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
      GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
-    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices,
+                           newIndicesSize,
                             &pmeGpu->archSpecific->gridlineIndicesSize,
                             &pmeGpu->archSpecific->gridlineIndicesSizeAlloc,
                             pmeGpu->archSpecific->deviceContext_);
@@ -347,11 +380,13 @@ void pme_gpu_realloc_grids(PmeGpu* pmeGpu)
          if (pmeGpu->archSpecific->performOutOfPlaceFFT)
          {
              /* 2 separate grids */
-            reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], newComplexGridSize,
+            reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex],
+                                   newComplexGridSize,
                                     &pmeGpu->archSpecific->complexGridSize[gridIndex],
                                     &pmeGpu->archSpecific->complexGridCapacity[gridIndex],
                                     pmeGpu->archSpecific->deviceContext_);
-            reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newRealGridSize,
+            reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex],
+                                   newRealGridSize,
                                     &pmeGpu->archSpecific->realGridSize[gridIndex],
                                     &pmeGpu->archSpecific->realGridCapacity[gridIndex],
                                     pmeGpu->archSpecific->deviceContext_);
@@ -360,7 +395,8 @@ void pme_gpu_realloc_grids(PmeGpu* pmeGpu)
          {
              /* A single buffer so that any grid will fit */
              const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
-            reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newGridsSize,
+            reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex],
+                                   newGridsSize,
                                     &pmeGpu->archSpecific->realGridSize[gridIndex],
                                     &pmeGpu->archSpecific->realGridCapacity[gridIndex],
                                     pmeGpu->archSpecific->deviceContext_);
@@ -388,7 +424,8 @@ void pme_gpu_clear_grids(const PmeGpu* pmeGpu)
  {
      for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
      {
-        clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0,
+        clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex],
+                               0,
                                 pmeGpu->archSpecific->realGridSize[gridIndex],
                                 pmeGpu->archSpecific->pmeStream_);
      }
@@ -411,12 +448,16 @@ void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
      const int newFractShiftsSize = cellCount * (nx + ny + nz);
  
      initParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable,
-                         &kernelParamsPtr->fractShiftsTableTexture, pmeGpu->common->fsh.data(),
-                         newFractShiftsSize, pmeGpu->archSpecific->deviceContext_);
+                         &kernelParamsPtr->fractShiftsTableTexture,
+                         pmeGpu->common->fsh.data(),
+                         newFractShiftsSize,
+                         pmeGpu->archSpecific->deviceContext_);
  
      initParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
-                         &kernelParamsPtr->gridlineIndicesTableTexture, pmeGpu->common->nn.data(),
-                         newFractShiftsSize, pmeGpu->archSpecific->deviceContext_);
+                         &kernelParamsPtr->gridlineIndicesTableTexture,
+                         pmeGpu->common->nn.data(),
+                         newFractShiftsSize,
+                         pmeGpu->archSpecific->deviceContext_);
  }
  
  void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu)
@@ -440,16 +481,24 @@ bool pme_gpu_stream_query(const PmeGpu* pmeGpu)
  
  void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, const int gridIndex)
  {
-    copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], h_grid, 0,
+    copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex],
+                       h_grid,
+                       0,
                         pmeGpu->archSpecific->realGridSize[gridIndex],
-                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+                       pmeGpu->archSpecific->pmeStream_,
+                       pmeGpu->settings.transferKind,
+                       nullptr);
  }
  
  void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, const int gridIndex)
  {
-    copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0,
+    copyFromDeviceBuffer(h_grid,
+                         &pmeGpu->kernelParams->grid.d_realGrid[gridIndex],
+                         0,
                           pmeGpu->archSpecific->realGridSize[gridIndex],
-                         pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+                         pmeGpu->archSpecific->pmeStream_,
+                         pmeGpu->settings.transferKind,
+                         nullptr);
      pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream_);
  }
  
@@ -457,13 +506,27 @@ void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
  {
      const size_t splinesCount    = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order;
      auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
-    copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
-                         pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
-    copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta, 0, splinesCount,
-                         pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
-    copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices,
-                         0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
-                         pmeGpu->settings.transferKind, nullptr);
+    copyFromDeviceBuffer(pmeGpu->staging.h_dtheta,
+                         &kernelParamsPtr->atoms.d_dtheta,
+                         0,
+                         splinesCount,
+                         pmeGpu->archSpecific->pmeStream_,
+                         pmeGpu->settings.transferKind,
+                         nullptr);
+    copyFromDeviceBuffer(pmeGpu->staging.h_theta,
+                         &kernelParamsPtr->atoms.d_theta,
+                         0,
+                         splinesCount,
+                         pmeGpu->archSpecific->pmeStream_,
+                         pmeGpu->settings.transferKind,
+                         nullptr);
+    copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices,
+                         &kernelParamsPtr->atoms.d_gridlineIndices,
+                         0,
+                         kernelParamsPtr->atoms.nAtoms * DIM,
+                         pmeGpu->archSpecific->pmeStream_,
+                         pmeGpu->settings.transferKind,
+                         nullptr);
  }
  
  void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
@@ -472,22 +535,40 @@ void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
      auto*        kernelParamsPtr = pmeGpu->kernelParams.get();
  
      // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
-    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * DIM,
+    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices,
+                           0,
+                           pmeGpu->nAtomsAlloc * DIM,
                             pmeGpu->archSpecific->pmeStream_);
-    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
+    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta,
+                           0,
                             pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
                             pmeGpu->archSpecific->pmeStream_);
-    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
+    clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta,
+                           0,
                             pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
                             pmeGpu->archSpecific->pmeStream_);
  
-    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
-                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
-    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
-                       pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
-    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
-                       0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
-                       pmeGpu->settings.transferKind, nullptr);
+    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta,
+                       pmeGpu->staging.h_dtheta,
+                       0,
+                       splinesCount,
+                       pmeGpu->archSpecific->pmeStream_,
+                       pmeGpu->settings.transferKind,
+                       nullptr);
+    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta,
+                       pmeGpu->staging.h_theta,
+                       0,
+                       splinesCount,
+                       pmeGpu->archSpecific->pmeStream_,
+                       pmeGpu->settings.transferKind,
+                       nullptr);
+    copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices,
+                       pmeGpu->staging.h_gridlineIndices,
+                       0,
+                       kernelParamsPtr->atoms.nAtoms * DIM,
+                       pmeGpu->archSpecific->pmeStream_,
+                       pmeGpu->settings.transferKind,
+                       nullptr);
  }
  
  void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu)
@@ -1238,14 +1319,16 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
          if (spreadCharges)
          {
              timingId  = gtPME_SPLINEANDSPREAD;
-            kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
+            kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu,
+                                                       pmeGpu->settings.threadsPerAtom,
                                                         writeGlobal || (!recalculateSplines),
                                                         pmeGpu->common->ngrids);
          }
          else
          {
              timingId  = gtPME_SPLINE;
-            kernelPtr = selectSplineKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
+            kernelPtr = selectSplineKernelPtr(pmeGpu,
+                                              pmeGpu->settings.threadsPerAtom,
                                                writeGlobal || (!recalculateSplines),
                                                pmeGpu->common->ngrids);
          }
@@ -1253,8 +1336,10 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
      else
      {
          timingId  = gtPME_SPREAD;
-        kernelPtr = selectSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
-                                          writeGlobal || (!recalculateSplines), pmeGpu->common->ngrids);
+        kernelPtr = selectSpreadKernelPtr(pmeGpu,
+                                          pmeGpu->settings.threadsPerAtom,
+                                          writeGlobal || (!recalculateSplines),
+                                          pmeGpu->common->ngrids);
      }
  
  
@@ -1263,17 +1348,24 @@ void pme_gpu_spread(const PmeGpu*         pmeGpu,
  #if c_canEmbedBuffers
      const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
  #else
-    const auto kernelArgs = prepareGpuKernelArguments(
-            kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_theta,
-            &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
-            &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
-            &kernelParamsPtr->grid.d_fractShiftsTable, &kernelParamsPtr->grid.d_gridlineIndicesTable,
-            &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
-            &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], &kernelParamsPtr->atoms.d_coordinates);
+    const auto kernelArgs =
+            prepareGpuKernelArguments(kernelPtr,
+                                      config,
+                                      kernelParamsPtr,
+                                      &kernelParamsPtr->atoms.d_theta,
+                                      &kernelParamsPtr->atoms.d_dtheta,
+                                      &kernelParamsPtr->atoms.d_gridlineIndices,
+                                      &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+                                      &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+                                      &kernelParamsPtr->grid.d_fractShiftsTable,
+                                      &kernelParamsPtr->grid.d_gridlineIndicesTable,
+                                      &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+                                      &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+                                      &kernelParamsPtr->atoms.d_coordinates);
  #endif
  
-    launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent,
-                    "PME spline/spread", kernelArgs);
+    launchGpuKernel(
+            kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME spline/spread", kernelArgs);
      pme_gpu_stop_timing(pmeGpu, timingId);
  
      const auto& settings    = pmeGpu->settings;
@@ -1314,9 +1406,13 @@ void pme_gpu_solve(const PmeGpu* pmeGpu,
      float* h_gridFloat = reinterpret_cast<float*>(h_grid);
      if (copyInputAndOutputGrid)
      {
-        copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], h_gridFloat, 0,
+        copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex],
+                           h_gridFloat,
+                           0,
                             pmeGpu->archSpecific->complexGridSize[gridIndex],
-                           pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+                           pmeGpu->archSpecific->pmeStream_,
+                           pmeGpu->settings.transferKind,
+                           nullptr);
      }
  
      int majorDim = -1, middleDim = -1, minorDim = -1;
@@ -1400,28 +1496,37 @@ void pme_gpu_solve(const PmeGpu* pmeGpu,
  #if c_canEmbedBuffers
      const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
  #else
-    const auto kernelArgs = prepareGpuKernelArguments(
-            kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli[gridIndex],
-            &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex],
-            &kernelParamsPtr->grid.d_fourierGrid[gridIndex]);
+    const auto kernelArgs =
+            prepareGpuKernelArguments(kernelPtr,
+                                      config,
+                                      kernelParamsPtr,
+                                      &kernelParamsPtr->grid.d_splineModuli[gridIndex],
+                                      &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex],
+                                      &kernelParamsPtr->grid.d_fourierGrid[gridIndex]);
  #endif
-    launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME solve",
-                    kernelArgs);
+    launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME solve", kernelArgs);
      pme_gpu_stop_timing(pmeGpu, timingId);
  
      if (computeEnergyAndVirial)
      {
          copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy[gridIndex],
-                             &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex], 0,
-                             c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_,
-                             pmeGpu->settings.transferKind, nullptr);
+                             &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex],
+                             0,
+                             c_virialAndEnergyCount,
+                             pmeGpu->archSpecific->pmeStream_,
+                             pmeGpu->settings.transferKind,
+                             nullptr);
      }
  
      if (copyInputAndOutputGrid)
      {
-        copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid[gridIndex], 0,
+        copyFromDeviceBuffer(h_gridFloat,
+                             &kernelParamsPtr->grid.d_fourierGrid[gridIndex],
+                             0,
                               pmeGpu->archSpecific->complexGridSize[gridIndex],
-                             pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+                             pmeGpu->archSpecific->pmeStream_,
+                             pmeGpu->settings.transferKind,
+                             nullptr);
      }
  }
  
@@ -1551,8 +1656,10 @@ void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda)
  
      int                                timingId = gtPME_GATHER;
      PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
-            selectGatherKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
-                                  readGlobal || (!recalculateSplines), pmeGpu->common->ngrids);
+            selectGatherKernelPtr(pmeGpu,
+                                  pmeGpu->settings.threadsPerAtom,
+                                  readGlobal || (!recalculateSplines),
+                                  pmeGpu->common->ngrids);
      // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
  
      pme_gpu_start_timing(pmeGpu, timingId);
@@ -1570,15 +1677,20 @@ void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda)
  #if c_canEmbedBuffers
      const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
  #else
-    const auto kernelArgs = prepareGpuKernelArguments(
-            kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
-            &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
-            &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
-            &kernelParamsPtr->atoms.d_theta, &kernelParamsPtr->atoms.d_dtheta,
-            &kernelParamsPtr->atoms.d_gridlineIndices, &kernelParamsPtr->atoms.d_forces);
+    const auto kernelArgs =
+            prepareGpuKernelArguments(kernelPtr,
+                                      config,
+                                      kernelParamsPtr,
+                                      &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+                                      &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+                                      &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+                                      &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+                                      &kernelParamsPtr->atoms.d_theta,
+                                      &kernelParamsPtr->atoms.d_dtheta,
+                                      &kernelParamsPtr->atoms.d_gridlineIndices,
+                                      &kernelParamsPtr->atoms.d_forces);
  #endif
-    launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME gather",
-                    kernelArgs);
+    launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME gather", kernelArgs);
      pme_gpu_stop_timing(pmeGpu, timingId);
  
      if (pmeGpu->settings.useGpuForceReduction)