for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
{
allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex],
- c_virialAndEnergyCount, pmeGpu->archSpecific->deviceContext_);
+ c_virialAndEnergyCount,
+ pmeGpu->archSpecific->deviceContext_);
pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_virialAndEnergy[gridIndex]), energyAndVirialSize);
}
}
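// [Sketch] A minimal, self-contained CUDA rendering of the allocation pattern above:
// one device accumulation buffer per grid, paired with a page-locked host staging
// buffer so the later device-to-host copy can run asynchronously. The GROMACS
// wrappers (allocateDeviceBuffer/pmalloc) abstract over CUDA/OpenCL/SYCL; the raw
// calls below are an assumed CUDA-only rendering, not the real wrapper API.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CHECK(call)                                                 \
    do {                                                            \
        cudaError_t err_ = (call);                                  \
        if (err_ != cudaSuccess) {                                  \
            std::fprintf(stderr, "%s\n", cudaGetErrorString(err_)); \
            std::exit(EXIT_FAILURE);                                \
        }                                                           \
    } while (0)

int main()
{
    const int numGrids             = 2; // e.g. FEP states A and B
    const int virialAndEnergyCount = 7; // 6 virial components + 1 energy (c_virialAndEnergyCount)
    float*    d_virialAndEnergy[2] = {};
    float*    h_virialAndEnergy[2] = {};
    for (int gridIndex = 0; gridIndex < numGrids; gridIndex++)
    {
        CHECK(cudaMalloc(&d_virialAndEnergy[gridIndex], virialAndEnergyCount * sizeof(float)));
        // Pinned host memory keeps the eventual cudaMemcpyAsync truly asynchronous
        CHECK(cudaHostAlloc(reinterpret_cast<void**>(&h_virialAndEnergy[gridIndex]),
                            virialAndEnergyCount * sizeof(float), cudaHostAllocDefault));
    }
    for (int gridIndex = 0; gridIndex < numGrids; gridIndex++)
    {
        CHECK(cudaFreeHost(h_virialAndEnergy[gridIndex]));
        CHECK(cudaFree(d_virialAndEnergy[gridIndex]));
    }
    return 0;
}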
void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu)
{
for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
{
- clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex], 0,
- c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_);
+ clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex],
+ 0,
+ c_virialAndEnergyCount,
+ pmeGpu->archSpecific->pmeStream_);
}
}
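// [Sketch] clearDeviceBufferAsync() above enqueues the zeroing on the PME stream
// instead of blocking the host. A hedged CUDA-only sketch of what such a wrapper
// might reduce to (the real one also handles typed offsets and other backends):
#include <cuda_runtime.h>
#include <cassert>

// Zero 'count' floats starting at element 'startingOffset', ordered on 'stream'.
static cudaError_t clearFloatBufferAsync(float* d_buffer, size_t startingOffset, size_t count, cudaStream_t stream)
{
    return cudaMemsetAsync(d_buffer + startingOffset, 0, count * sizeof(float), stream);
}

int main()
{
    const size_t count = 7;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    float* d_buf = nullptr;
    cudaMalloc(&d_buf, count * sizeof(float));
    clearFloatBufferAsync(d_buf, 0, count, stream);
    cudaStreamSynchronize(stream); // the host only waits when it needs the result
    float h_buf[7];
    cudaMemcpy(h_buf, d_buf, sizeof(h_buf), cudaMemcpyDeviceToHost);
    for (float v : h_buf) { assert(v == 0.0f); }
    cudaFree(d_buf);
    cudaStreamDestroy(stream);
    return 0;
}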
void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, const int gridIndex)
{
GMX_ASSERT(gridIndex < pmeGpu->common->ngrids,
"Invalid combination of gridIndex and number of grids");
- const int splineValuesOffset[DIM] = { 0, pmeGpu->kernelParams->grid.realGridSize[XX],
+ const int splineValuesOffset[DIM] = { 0,
+ pmeGpu->kernelParams->grid.realGridSize[XX],
pmeGpu->kernelParams->grid.realGridSize[XX]
+ pmeGpu->kernelParams->grid.realGridSize[YY] };
memcpy(&pmeGpu->kernelParams->grid.splineValuesOffset, &splineValuesOffset, sizeof(splineValuesOffset));
+ const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX]
+                                 + pmeGpu->kernelParams->grid.realGridSize[YY]
+                                 + pmeGpu->kernelParams->grid.realGridSize[ZZ];
const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize[gridIndex]);
reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex],
- newSplineValuesSize, &pmeGpu->archSpecific->splineValuesSize[gridIndex],
+ newSplineValuesSize,
+ &pmeGpu->archSpecific->splineValuesSize[gridIndex],
&pmeGpu->archSpecific->splineValuesCapacity[gridIndex],
pmeGpu->archSpecific->deviceContext_);
if (shouldRealloc)
{
    /* Reallocate the host buffer */
    pfree(pmeGpu->staging.h_splineModuli[gridIndex]);
    pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_splineModuli[gridIndex]),
            newSplineValuesSize * sizeof(float));
}
for (int i = 0; i < DIM; i++)
{
memcpy(pmeGpu->staging.h_splineModuli[gridIndex] + splineValuesOffset[i],
- pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
+ pmeGpu->common->bsp_mod[i].data(),
+ pmeGpu->common->bsp_mod[i].size() * sizeof(float));
}
/* TODO: pin original buffer instead! */
copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex],
- pmeGpu->staging.h_splineModuli[gridIndex], 0, newSplineValuesSize,
- pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->staging.h_splineModuli[gridIndex],
+ 0,
+ newSplineValuesSize,
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
}
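// [Sketch] The B-spline moduli for X, Y and Z are packed into one buffer, with
// splineValuesOffset[] = { 0, nx, nx + ny } recording where each dimension starts.
// A self-contained model of that packing (the sizes are made up for the example):
#include <cassert>
#include <cstring>
#include <vector>

int main()
{
    const std::vector<float> bspMod[3] = { std::vector<float>(24, 1.0f),   // nx values
                                           std::vector<float>(20, 2.0f),   // ny values
                                           std::vector<float>(16, 3.0f) }; // nz values
    const size_t nx = bspMod[0].size(), ny = bspMod[1].size(), nz = bspMod[2].size();
    const size_t offset[3] = { 0, nx, nx + ny }; // mirrors splineValuesOffset[DIM]
    std::vector<float> packed(nx + ny + nz);     // mirrors newSplineValuesSize
    for (int i = 0; i < 3; i++)
    {
        std::memcpy(packed.data() + offset[i], bspMod[i].data(), bspMod[i].size() * sizeof(float));
    }
    assert(packed[offset[1]] == 2.0f && packed[offset[2]] == 3.0f);
    return 0;
}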
void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu)
void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
{
const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
- reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
- &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc,
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
+ newForcesSize,
+ &pmeGpu->archSpecific->forcesSize,
+ &pmeGpu->archSpecific->forcesSizeAlloc,
pmeGpu->archSpecific->deviceContext_);
pmeGpu->staging.h_forces.reserveWithPadding(pmeGpu->nAtomsAlloc);
pmeGpu->staging.h_forces.resizeWithPadding(pmeGpu->kernelParams->atoms.nAtoms);
}
void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
{
GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
- copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat, 0,
- DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
- pmeGpu->settings.transferKind, nullptr);
+ copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
+ h_forcesFloat,
+ 0,
+ DIM * pmeGpu->kernelParams->atoms.nAtoms,
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
}
void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
{
GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
- copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0,
- DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
- pmeGpu->settings.transferKind, nullptr);
+ copyFromDeviceBuffer(h_forcesFloat,
+ &pmeGpu->kernelParams->atoms.d_forces,
+ 0,
+ DIM * pmeGpu->kernelParams->atoms.nAtoms,
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
}
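// [Sketch] Both force copies above move DIM * nAtoms floats through a float* view
// of the staging buffer. That only works because an rvec-like element is exactly
// three contiguous floats with no padding; a minimal model of the invariant the
// reinterpret_cast relies on (Rvec here is a stand-in for the real gmx::RVec):
#include <cstddef>

struct Rvec
{
    float x, y, z;
};
static_assert(sizeof(Rvec) == 3 * sizeof(float), "no padding allowed");

// Number of floats to transfer for nAtoms force vectors (DIM * nAtoms)
constexpr std::size_t transferCount(std::size_t nAtoms)
{
    return 3 * nAtoms;
}

int main()
{
    Rvec   forces[4] = {};
    float* view      = reinterpret_cast<float*>(forces); // same trick as h_forcesFloat
    view[3 * 2 + 1]  = 1.5f;                             // y component of atom 2
    static_assert(transferCount(4) == 12, "flat view has DIM * nAtoms elements");
    return forces[2].y == 1.5f ? 0 : 1;
}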
void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu,
                                                 const float*  h_coefficients,
                                                 const int     gridIndex)
{
const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex],
- newCoefficientsSize, &pmeGpu->archSpecific->coefficientsSize[gridIndex],
+ newCoefficientsSize,
+ &pmeGpu->archSpecific->coefficientsSize[gridIndex],
&pmeGpu->archSpecific->coefficientsCapacity[gridIndex],
pmeGpu->archSpecific->deviceContext_);
copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex],
- const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
- pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+ const_cast<float*>(h_coefficients),
+ 0,
+ pmeGpu->kernelParams->atoms.nAtoms,
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
if (paddingCount > 0)
{
- clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex], paddingIndex,
- paddingCount, pmeGpu->archSpecific->pmeStream_);
+ clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex],
+ paddingIndex,
+ paddingCount,
+ pmeGpu->archSpecific->pmeStream_);
}
}
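// [Sketch] The padding clear above matters for correctness: nAtomsAlloc is rounded
// up so thread blocks are full, and the kernels process padded atoms like real
// ones, so their charges must be zero to contribute nothing. A hedged model of the
// index arithmetic (the rounding factor 64 is an assumption for the example, not
// necessarily the value used on every architecture):
#include <cassert>
#include <cstddef>

static std::size_t roundUp(std::size_t n, std::size_t multiple)
{
    return ((n + multiple - 1) / multiple) * multiple;
}

int main()
{
    const std::size_t nAtoms       = 1000;
    const std::size_t nAtomsAlloc  = roundUp(nAtoms, 64);        // 1024
    const std::size_t paddingIndex = nAtoms;                     // first padded slot
    const std::size_t paddingCount = nAtomsAlloc - paddingIndex; // slots to zero
    assert(paddingCount == 24);
    // On the device this region is what clearDeviceBufferAsync(d_coefficients,
    // paddingIndex, paddingCount, stream) zeroes.
    return 0;
}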
void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
{
const int newSplineDataSize = DIM * pmeGpu->common->pme_order * pmeGpu->nAtomsAlloc;
/* Two arrays of the same size */
const bool shouldRealloc = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
int currentSizeTemp = pmeGpu->archSpecific->splineDataSize;
int currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
- reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize, &currentSizeTemp,
- &currentSizeTempAlloc, pmeGpu->archSpecific->deviceContext_);
- reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
- &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc,
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta,
+ newSplineDataSize,
+ &currentSizeTemp,
+ &currentSizeTempAlloc,
+ pmeGpu->archSpecific->deviceContext_);
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta,
+ newSplineDataSize,
+ &pmeGpu->archSpecific->splineDataSize,
+ &pmeGpu->archSpecific->splineDataSizeAlloc,
pmeGpu->archSpecific->deviceContext_);
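// [Sketch] Why the currentSizeTemp copies above: reallocateDeviceBuffer() is
// grow-only and updates the size/capacity variables it is handed, but d_theta and
// d_dtheta deliberately share one tracked size. Growing d_theta through the real
// counters would leave them already "current", turning the d_dtheta call into a
// no-op. A pure-C++ model of that hazard:
#include <cassert>
#include <cstdlib>

static void reallocGrowOnly(float** buffer, int newSize, int* currentSize, int* capacity)
{
    if (newSize > *capacity) // grow-only: smaller requests keep the old storage
    {
        std::free(*buffer);
        *buffer   = static_cast<float*>(std::malloc(newSize * sizeof(float)));
        *capacity = newSize;
    }
    *currentSize = newSize;
}

int main()
{
    float *d_theta = nullptr, *d_dtheta = nullptr;
    int    sharedSize = 0, sharedCapacity = 0;
    // Scratch copies absorb the updates from the first call...
    int sizeTemp = sharedSize, capacityTemp = sharedCapacity;
    reallocGrowOnly(&d_theta, 512, &sizeTemp, &capacityTemp);
    // ...so the second call still sees the stale shared counters and also grows.
    reallocGrowOnly(&d_dtheta, 512, &sharedSize, &sharedCapacity);
    assert(d_theta != nullptr && d_dtheta != nullptr && sharedCapacity == 512);
    std::free(d_theta);
    std::free(d_dtheta);
    return 0;
}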
// the host side reallocation
if (shouldRealloc)
{
pfree(pmeGpu->staging.h_theta);
pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_theta), newSplineDataSize * sizeof(float));
pfree(pmeGpu->staging.h_dtheta);
pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_dtheta), newSplineDataSize * sizeof(float));
}
}
void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu)
{
const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
- reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices,
+ newIndicesSize,
&pmeGpu->archSpecific->gridlineIndicesSize,
&pmeGpu->archSpecific->gridlineIndicesSizeAlloc,
pmeGpu->archSpecific->deviceContext_);
if (pmeGpu->archSpecific->performOutOfPlaceFFT)
{
/* 2 separate grids */
- reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], newComplexGridSize,
+ reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex],
+ newComplexGridSize,
&pmeGpu->archSpecific->complexGridSize[gridIndex],
&pmeGpu->archSpecific->complexGridCapacity[gridIndex],
pmeGpu->archSpecific->deviceContext_);
- reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newRealGridSize,
+ reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex],
+ newRealGridSize,
&pmeGpu->archSpecific->realGridSize[gridIndex],
&pmeGpu->archSpecific->realGridCapacity[gridIndex],
pmeGpu->archSpecific->deviceContext_);
}
else
{
/* A single buffer so that any grid will fit */
const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
- reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newGridsSize,
+ reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex],
+ newGridsSize,
&pmeGpu->archSpecific->realGridSize[gridIndex],
&pmeGpu->archSpecific->realGridCapacity[gridIndex],
pmeGpu->archSpecific->deviceContext_);
}
}
void pme_gpu_clear_grids(const PmeGpu* pmeGpu)
{
for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
{
- clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0,
+ clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex],
+ 0,
pmeGpu->archSpecific->realGridSize[gridIndex],
pmeGpu->archSpecific->pmeStream_);
}
const int newFractShiftsSize = cellCount * (nx + ny + nz);
initParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable,
- &kernelParamsPtr->fractShiftsTableTexture, pmeGpu->common->fsh.data(),
- newFractShiftsSize, pmeGpu->archSpecific->deviceContext_);
+ &kernelParamsPtr->fractShiftsTableTexture,
+ pmeGpu->common->fsh.data(),
+ newFractShiftsSize,
+ pmeGpu->archSpecific->deviceContext_);
initParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
- &kernelParamsPtr->gridlineIndicesTableTexture, pmeGpu->common->nn.data(),
- newFractShiftsSize, pmeGpu->archSpecific->deviceContext_);
+ &kernelParamsPtr->gridlineIndicesTableTexture,
+ pmeGpu->common->nn.data(),
+ newFractShiftsSize,
+ pmeGpu->archSpecific->deviceContext_);
}
void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu)
void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, const int gridIndex)
{
- copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], h_grid, 0,
+ copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex],
+ h_grid,
+ 0,
pmeGpu->archSpecific->realGridSize[gridIndex],
- pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
}
void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, const int gridIndex)
{
- copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0,
+ copyFromDeviceBuffer(h_grid,
+ &pmeGpu->kernelParams->grid.d_realGrid[gridIndex],
+ 0,
pmeGpu->archSpecific->realGridSize[gridIndex],
- pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream_);
}
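// [Sketch] syncSpreadGridD2H.markEvent() above records an event on the PME stream
// right after the asynchronous grid download, so a consumer can later wait for
// just that transfer instead of the whole stream. A CUDA rendering of the idiom
// (the GROMACS GpuEventSynchronizer wraps calls like these):
#include <cuda_runtime.h>

int main()
{
    cudaStream_t pmeStream;
    cudaEvent_t  syncSpreadGridD2H;
    cudaStreamCreate(&pmeStream);
    cudaEventCreateWithFlags(&syncSpreadGridD2H, cudaEventDisableTiming);

    const size_t gridSize = 1 << 16;
    float*       d_grid   = nullptr;
    float*       h_grid   = nullptr;
    cudaMalloc(&d_grid, gridSize * sizeof(float));
    cudaHostAlloc(reinterpret_cast<void**>(&h_grid), gridSize * sizeof(float), cudaHostAllocDefault);

    cudaMemcpyAsync(h_grid, d_grid, gridSize * sizeof(float), cudaMemcpyDeviceToHost, pmeStream);
    cudaEventRecord(syncSpreadGridD2H, pmeStream); // markEvent()
    // ... other host or device work can proceed here ...
    cudaEventSynchronize(syncSpreadGridD2H);       // waitForEvent(): h_grid is now valid

    cudaFreeHost(h_grid);
    cudaFree(d_grid);
    cudaEventDestroy(syncSpreadGridD2H);
    cudaStreamDestroy(pmeStream);
    return 0;
}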
void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
{
const size_t splinesCount = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order;
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
- copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
- pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
- copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta, 0, splinesCount,
- pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
- copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices,
- 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
- pmeGpu->settings.transferKind, nullptr);
+ copyFromDeviceBuffer(pmeGpu->staging.h_dtheta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ 0,
+ splinesCount,
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
+ copyFromDeviceBuffer(pmeGpu->staging.h_theta,
+ &kernelParamsPtr->atoms.d_theta,
+ 0,
+ splinesCount,
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
+ copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ 0,
+ kernelParamsPtr->atoms.nAtoms * DIM,
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
}
void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu)
{
const size_t splinesCount = DIM * pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order;
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
// TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
- clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0, pmeGpu->nAtomsAlloc * DIM,
+ clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices,
+ 0,
+ pmeGpu->nAtomsAlloc * DIM,
pmeGpu->archSpecific->pmeStream_);
- clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
+ clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta,
+ 0,
pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
pmeGpu->archSpecific->pmeStream_);
- clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
+ clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta,
+ 0,
pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
pmeGpu->archSpecific->pmeStream_);
- copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
- pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
- copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
- pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
- copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
- 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
- pmeGpu->settings.transferKind, nullptr);
+ copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta,
+ pmeGpu->staging.h_dtheta,
+ 0,
+ splinesCount,
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
+ copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta,
+ pmeGpu->staging.h_theta,
+ 0,
+ splinesCount,
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
+ copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices,
+ pmeGpu->staging.h_gridlineIndices,
+ 0,
+ kernelParamsPtr->atoms.nAtoms * DIM,
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
}
void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu)
if (computeSplines)
{
if (spreadCharges)
{
timingId = gtPME_SPLINEANDSPREAD;
- kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
+ kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu,
+ pmeGpu->settings.threadsPerAtom,
writeGlobal || (!recalculateSplines),
pmeGpu->common->ngrids);
}
else
{
timingId = gtPME_SPLINE;
- kernelPtr = selectSplineKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
+ kernelPtr = selectSplineKernelPtr(pmeGpu,
+ pmeGpu->settings.threadsPerAtom,
writeGlobal || (!recalculateSplines),
pmeGpu->common->ngrids);
}
}
else
{
timingId = gtPME_SPREAD;
- kernelPtr = selectSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines), pmeGpu->common->ngrids);
+ kernelPtr = selectSpreadKernelPtr(pmeGpu,
+ pmeGpu->settings.threadsPerAtom,
+ writeGlobal || (!recalculateSplines),
+ pmeGpu->common->ngrids);
}
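// [Sketch] The chain above picks one of three kernel families (fused spline+spread,
// spline only, spread only), and each select*KernelPtr() call further specializes
// on threadsPerAtom, whether splines are written to global memory, and the number
// of grids. A compact model of that dispatch with plain function pointers (the
// enum values and helper names are assumptions for the sketch):
#include <cstdio>

enum class ThreadsPerAtom { Order, OrderSquared };
using KernelHandle = void (*)();

static void splineAndSpreadKernel() { std::puts("spline+spread"); }
static void splineKernel()          { std::puts("spline"); }
static void spreadKernel()          { std::puts("spread"); }

static KernelHandle selectKernel(bool computeSplines, bool spreadCharges,
                                 ThreadsPerAtom /*threadsPerAtom*/,
                                 bool /*writeSplinesToGlobal*/, int /*numGrids*/)
{
    // A full implementation would index a table over all option combinations;
    // only the family choice is modeled here.
    if (computeSplines)
    {
        return spreadCharges ? splineAndSpreadKernel : splineKernel;
    }
    return spreadKernel;
}

int main()
{
    KernelHandle kernelPtr = selectKernel(true, true, ThreadsPerAtom::Order, false, 1);
    kernelPtr();
    return 0;
}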
#if c_canEmbedBuffers
const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
- const auto kernelArgs = prepareGpuKernelArguments(
- kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_theta,
- &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
- &kernelParamsPtr->grid.d_fractShiftsTable, &kernelParamsPtr->grid.d_gridlineIndicesTable,
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], &kernelParamsPtr->atoms.d_coordinates);
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
#endif
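// [Sketch] The #if above exists because CUDA can pass every device pointer inside
// one kernel-parameter struct by value (c_canEmbedBuffers), while OpenCL requires
// each buffer to be bound as a separate kernel argument. A minimal CUDA rendering
// of the embedded case (the struct fields are a trimmed-down assumption, not the
// real PmeGpuKernelParams layout):
#include <cuda_runtime.h>

struct SpreadKernelParams
{
    const float* d_coordinates; // read-only atom data
    float*       d_realGrid;    // output grid
    int          nAtoms;
};

__global__ void spreadSketchKernel(const SpreadKernelParams params)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < params.nAtoms)
    {
        // Real spreading scatters order^3 contributions; this just touches the grid.
        params.d_realGrid[i] = params.d_coordinates[i];
    }
}

int main()
{
    const int nAtoms   = 256;
    float*    d_coords = nullptr;
    float*    d_grid   = nullptr;
    cudaMalloc(&d_coords, nAtoms * sizeof(float));
    cudaMalloc(&d_grid, nAtoms * sizeof(float));
    const SpreadKernelParams params = { d_coords, d_grid, nAtoms };
    spreadSketchKernel<<<(nAtoms + 127) / 128, 128>>>(params); // struct passed by value
    cudaDeviceSynchronize();
    cudaFree(d_grid);
    cudaFree(d_coords);
    return 0;
}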
- launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent,
- "PME spline/spread", kernelArgs);
+ launchGpuKernel(
+ kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME spline/spread", kernelArgs);
pme_gpu_stop_timing(pmeGpu, timingId);
}
void pme_gpu_solve(const PmeGpu* pmeGpu,
                   const int     gridIndex,
                   t_complex*    h_grid,
                   GridOrdering  gridOrdering,
                   bool          computeEnergyAndVirial)
{
const auto& settings = pmeGpu->settings;
float* h_gridFloat = reinterpret_cast<float*>(h_grid);
if (copyInputAndOutputGrid)
{
- copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], h_gridFloat, 0,
+ copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex],
+ h_gridFloat,
+ 0,
pmeGpu->archSpecific->complexGridSize[gridIndex],
- pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
}
int majorDim = -1, middleDim = -1, minorDim = -1;
#if c_canEmbedBuffers
const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
- const auto kernelArgs = prepareGpuKernelArguments(
- kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli[gridIndex],
- &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex],
- &kernelParamsPtr->grid.d_fourierGrid[gridIndex]);
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->grid.d_splineModuli[gridIndex],
+ &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex],
+ &kernelParamsPtr->grid.d_fourierGrid[gridIndex]);
#endif
- launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME solve",
- kernelArgs);
+ launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME solve", kernelArgs);
pme_gpu_stop_timing(pmeGpu, timingId);
if (computeEnergyAndVirial)
{
copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy[gridIndex],
- &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex], 0,
- c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_,
- pmeGpu->settings.transferKind, nullptr);
+ &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex],
+ 0,
+ c_virialAndEnergyCount,
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
}
if (copyInputAndOutputGrid)
{
- copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid[gridIndex], 0,
+ copyFromDeviceBuffer(h_gridFloat,
+ &kernelParamsPtr->grid.d_fourierGrid[gridIndex],
+ 0,
pmeGpu->archSpecific->complexGridSize[gridIndex],
- pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind,
+ nullptr);
}
}
int timingId = gtPME_GATHER;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
- selectGatherKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
- readGlobal || (!recalculateSplines), pmeGpu->common->ngrids);
+ selectGatherKernelPtr(pmeGpu,
+ pmeGpu->settings.threadsPerAtom,
+ readGlobal || (!recalculateSplines),
+ pmeGpu->common->ngrids);
// TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
pme_gpu_start_timing(pmeGpu, timingId);
#if c_canEmbedBuffers
const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
- const auto kernelArgs = prepareGpuKernelArguments(
- kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
- &kernelParamsPtr->atoms.d_theta, &kernelParamsPtr->atoms.d_dtheta,
- &kernelParamsPtr->atoms.d_gridlineIndices, &kernelParamsPtr->atoms.d_forces);
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->atoms.d_forces);
#endif
- launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME gather",
- kernelArgs);
+ launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME gather", kernelArgs);
pme_gpu_stop_timing(pmeGpu, timingId);
if (pmeGpu->settings.useGpuForceReduction)