/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team.
+ * Copyright (c) 2016,2017,2018,2019,2020 by the GROMACS development team.
* Copyright (c) 2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
#include <string>
#include "gromacs/ewald/ewald_utils.h"
+#include "gromacs/fft/gpu_3dfft.h"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/pmalloc.h"
+#if GMX_GPU_SYCL
+# include "gromacs/gpu_utils/syclutils.h"
+#endif
#include "gromacs/hardware/device_information.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/math/units.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#if GMX_GPU_CUDA
-# include "gromacs/gpu_utils/pmalloc_cuda.h"
-
# include "pme.cuh"
-#elif GMX_GPU_OPENCL
-# include "gromacs/gpu_utils/gmxopencl.h"
-#elif GMX_GPU_SYCL
-# include "gromacs/gpu_utils/syclutils.h"
#endif
-#include "gromacs/ewald/pme.h"
-
-#include "pme_gpu_3dfft.h"
#include "pme_gpu_calculate_splines.h"
#include "pme_gpu_constants.h"
#include "pme_gpu_program_impl.h"
void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
{
- const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
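+ // d_forces now holds gmx::RVec elements, so buffer sizes are counted in atoms rather than in float components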
+ const size_t newForcesSize = pmeGpu->nAtomsAlloc;
GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
newForcesSize,
void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
{
GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
- float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
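+ // h_forces also stores gmx::RVec, so the host pointer can be passed to the copy directly, with the count in atoms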
copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
- h_forcesFloat,
+ pmeGpu->staging.h_forces.data(),
0,
- DIM * pmeGpu->kernelParams->atoms.nAtoms,
+ pmeGpu->kernelParams->atoms.nAtoms,
pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind,
nullptr);
void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
{
GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
- float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
- copyFromDeviceBuffer(h_forcesFloat,
+ copyFromDeviceBuffer(pmeGpu->staging.h_forces.data(),
&pmeGpu->kernelParams->atoms.d_forces,
0,
- DIM * pmeGpu->kernelParams->atoms.nAtoms,
+ pmeGpu->kernelParams->atoms.nAtoms,
pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind,
nullptr);
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
#if GMX_GPU_CUDA
destroyParamLookupTable(&kernelParamsPtr->grid.d_fractShiftsTable,
- kernelParamsPtr->fractShiftsTableTexture);
+ &kernelParamsPtr->fractShiftsTableTexture);
destroyParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
- kernelParamsPtr->gridlineIndicesTableTexture);
+ &kernelParamsPtr->gridlineIndicesTableTexture);
#elif GMX_GPU_OPENCL || GMX_GPU_SYCL
freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
*/
#if GMX_GPU_CUDA
- pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
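+ // Spread pipelining is configured per step in pme_gpu_spread(); start with it disabled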
+ pmeGpu->kernelParams->usePipeline = false;
+ pmeGpu->kernelParams->pipelineAtomStart = 0;
+ pmeGpu->kernelParams->pipelineAtomEnd = 0;
+ pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
#else
// Use this path for any non-CUDA GPU acceleration
// TODO: is there really no global work size limit in OpenCL?
if (pme_gpu_settings(pmeGpu).performGPUFFT)
{
pmeGpu->archSpecific->fftSetup.resize(0);
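+ // This is the single-rank (non-decomposed) FFT path: the PME grids passed in below are used
+ // directly, so Gpu3dFft does not allocate its own grid and no MPI communicator is needed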
+ const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT;
+ const bool allocateGrid = false;
+ MPI_Comm comm = MPI_COMM_NULL;
+ std::array<int, 1> gridOffsetsInXForEachRank = { 0 };
+ std::array<int, 1> gridOffsetsInYForEachRank = { 0 };
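+ // Select the FFT backend that matches the GPU build configuration (CUDA, OpenCL, or SYCL)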
+#if GMX_GPU_CUDA
+ const gmx::FftBackend backend = gmx::FftBackend::Cufft;
+#elif GMX_GPU_OPENCL
+ const gmx::FftBackend backend = gmx::FftBackend::Ocl;
+#elif GMX_GPU_SYCL
+# if GMX_SYCL_DPCPP && GMX_FFT_MKL
+ const gmx::FftBackend backend = gmx::FftBackend::SyclMkl;
+# elif GMX_SYCL_HIPSYCL
+ const gmx::FftBackend backend = gmx::FftBackend::SyclRocfft;
+# else
+ const gmx::FftBackend backend = gmx::FftBackend::Sycl;
+# endif
+#else
+ GMX_RELEASE_ASSERT(false, "Unknown GPU backend");
+ const gmx::FftBackend backend = gmx::FftBackend::Count;
+#endif
+
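+ // Create one Gpu3dFft setup per PME grid; ngrids is 1, or 2 with FEP perturbed charges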
+ PmeGpuGridParams& grid = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->grid;
for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
{
- pmeGpu->archSpecific->fftSetup.push_back(std::make_unique<GpuParallel3dFft>(pmeGpu, gridIndex));
+ pmeGpu->archSpecific->fftSetup.push_back(
+ std::make_unique<gmx::Gpu3dFft>(backend,
+ allocateGrid,
+ comm,
+ gridOffsetsInXForEachRank,
+ gridOffsetsInYForEachRank,
+ grid.realGridSize[ZZ],
+ performOutOfPlaceFFT,
+ pmeGpu->archSpecific->deviceContext_,
+ pmeGpu->archSpecific->pmeStream_,
+ grid.realGridSize,
+ grid.realGridSizePadded,
+ grid.complexGridSizePadded,
+ &(grid.d_realGrid[gridIndex]),
+ &(grid.d_fourierGrid[gridIndex])));
}
}
}
pmeGpu->common->nn.insert(pmeGpu->common->nn.end(), pme->nnz, pme->nnz + cellCount * pme->nkz);
pmeGpu->common->runMode = pme->runMode;
pmeGpu->common->isRankPmeOnly = !pme->bPPnode;
- pmeGpu->common->boxScaler = pme->boxScaler;
+ pmeGpu->common->boxScaler = pme->boxScaler.get();
}
/*! \libinternal \brief
* In CUDA result can be nullptr stub, per GpuRegionTimer implementation.
*
* \param[in] pmeGpu The PME GPU data structure.
- * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h
+ * \param[in] pmeStageId The PME GPU stage index from the PmeStage enum in src/gromacs/timing/gpu_timing.h
*/
-static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PMEStageId)
+static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, PmeStage pmeStageId)
{
CommandEvent* timingEvent = nullptr;
if (pme_gpu_timings_enabled(pmeGpu))
{
- GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
- "Wrong PME GPU timing event index");
- timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent();
+ GMX_ASSERT(pmeStageId < PmeStage::Count, "Wrong PME GPU timing event index");
+ timingEvent = pmeGpu->archSpecific->timingEvents[pmeStageId].fetchNextEvent();
}
return timingEvent;
}
void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, const int grid_index)
{
- int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;
+ PmeStage timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? PmeStage::FftTransformR2C
+ : PmeStage::FftTransformC2R;
pme_gpu_start_timing(pmeGpu, timerId);
pmeGpu->archSpecific->fftSetup[grid_index]->perform3dFft(
*
* \return Pointer to CUDA kernel
*/
-static auto selectSplineKernelPtr(const PmeGpu* pmeGpu,
- ThreadsPerAtom threadsPerAtom,
+static auto selectSplineKernelPtr(const PmeGpu* pmeGpu,
+ ThreadsPerAtom threadsPerAtom,
bool gmx_unused writeSplinesToGlobal,
const int numGrids)
{
{
kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Dual;
}
+ else
{
kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Single;
}
return kernelPtr;
}
-void pme_gpu_spread(const PmeGpu* pmeGpu,
- GpuEventSynchronizer* xReadyOnDevice,
- real** h_grids,
- bool computeSplines,
- bool spreadCharges,
- const real lambda)
+void pme_gpu_spread(const PmeGpu* pmeGpu,
+ GpuEventSynchronizer* xReadyOnDevice,
+ real** h_grids,
+ bool computeSplines,
+ bool spreadCharges,
+ const real lambda,
+ const bool useGpuDirectComm,
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
{
GMX_ASSERT(
pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
config.gridSize[0] = dimGrid.first;
config.gridSize[1] = dimGrid.second;
- int timingId;
+ PmeStage timingId;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+ const bool writeGlobalOrSaveSplines = writeGlobal || (!recalculateSplines);
if (computeSplines)
{
if (spreadCharges)
{
- timingId = gtPME_SPLINEANDSPREAD;
+ timingId = PmeStage::SplineAndSpread;
kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
+ writeGlobalOrSaveSplines,
pmeGpu->common->ngrids);
}
else
{
- timingId = gtPME_SPLINE;
+ timingId = PmeStage::Spline;
kernelPtr = selectSplineKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
+ writeGlobalOrSaveSplines,
pmeGpu->common->ngrids);
}
}
else
{
- timingId = gtPME_SPREAD;
- kernelPtr = selectSpreadKernelPtr(pmeGpu,
- pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
- pmeGpu->common->ngrids);
+ timingId = PmeStage::Spread;
+ kernelPtr = selectSpreadKernelPtr(
+ pmeGpu, pmeGpu->settings.threadsPerAtom, writeGlobalOrSaveSplines, pmeGpu->common->ngrids);
}
pme_gpu_start_timing(pmeGpu, timingId);
auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+
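+ // Pipelining overlaps receipt of coordinates from individual PP ranks with spreading of the
+ // corresponding atom ranges. It is only used for combined spline-and-spread kernels with GPU
+ // direct communication from more than one sender rank, and when splines need not be written
+ // to global memory.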
+ kernelParamsPtr->usePipeline = computeSplines && spreadCharges && useGpuDirectComm
+ && (pmeCoordinateReceiverGpu->ppCommNumSenderRanks() > 1)
+ && !writeGlobalOrSaveSplines;
+ if (kernelParamsPtr->usePipeline)
+ {
+ int numStagesInPipeline = pmeCoordinateReceiverGpu->ppCommNumSenderRanks();
+
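+ // Launch one spread kernel per sender rank on that rank's communication stream, as soon as
+ // its coordinates have been received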
+ for (int i = 0; i < numStagesInPipeline; i++)
+ {
+ int senderRank;
+ if (useGpuDirectComm)
+ {
+ senderRank = pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRank(
+ i, *(pmeCoordinateReceiverGpu->ppCommStream(i)));
+ }
+ else
+ {
+ senderRank = i;
+ }
+
+ // set kernel configuration options specific to this stage of the pipeline
+ std::tie(kernelParamsPtr->pipelineAtomStart, kernelParamsPtr->pipelineAtomEnd) =
+ pmeCoordinateReceiverGpu->ppCommAtomRange(senderRank);
+ const int blockCount = static_cast<int>(std::ceil(
+ static_cast<float>(kernelParamsPtr->pipelineAtomEnd - kernelParamsPtr->pipelineAtomStart)
+ / atomsPerBlock));
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
+ config.gridSize[0] = dimGrid.first;
+ config.gridSize[1] = dimGrid.second;
+ DeviceStream* launchStream = pmeCoordinateReceiverGpu->ppCommStream(senderRank);
+
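+ // Apart from the atom range set above and the per-rank launch stream, the launch
+ // configuration matches the non-pipelined path below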
#if c_canEmbedBuffers
- const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
- const auto kernelArgs =
- prepareGpuKernelArguments(kernelPtr,
- config,
- kernelParamsPtr,
- &kernelParamsPtr->atoms.d_theta,
- &kernelParamsPtr->atoms.d_dtheta,
- &kernelParamsPtr->atoms.d_gridlineIndices,
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
- &kernelParamsPtr->grid.d_fractShiftsTable,
- &kernelParamsPtr->grid.d_gridlineIndicesTable,
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
- &kernelParamsPtr->atoms.d_coordinates);
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
+#endif
+
+ launchGpuKernel(kernelPtr, config, *launchStream, timingEvent, "PME spline/spread", kernelArgs);
+ }
+ // Set dependencies for PME stream on all pipeline streams
+ for (int i = 0; i < pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); i++)
+ {
+ GpuEventSynchronizer event;
+ event.markEvent(*(pmeCoordinateReceiverGpu->ppCommStream(i)));
+ event.enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
+ }
+ }
+ else // pipelining is not in use
+ {
+ if (useGpuDirectComm) // Sync all PME-PP communications to PME stream
+ {
+ pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromAllPpRanks(pmeGpu->archSpecific->pmeStream_);
+ }
+
+#if c_canEmbedBuffers
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+#else
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
#endif
- launchGpuKernel(
- kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME spline/spread", kernelArgs);
+ launchGpuKernel(kernelPtr,
+ config,
+ pmeGpu->archSpecific->pmeStream_,
+ timingEvent,
+ "PME spline/spread",
+ kernelArgs);
+ }
+
pme_gpu_stop_timing(pmeGpu, timingId);
const auto& settings = pmeGpu->settings;
/ gridLinesPerBlock;
config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim];
- int timingId = gtPME_SOLVE;
+ PmeStage timingId = PmeStage::Solve;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
if (gridOrdering == GridOrdering::YZX)
{
// TODO test different cache configs
- int timingId = gtPME_GATHER;
+ PmeStage timingId = PmeStage::Gather;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
selectGatherKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
}
}
-void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
+DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
{
if (pmeGpu && pmeGpu->kernelParams)
{
}
else
{
- return nullptr;
+ return DeviceBuffer<gmx::RVec>{};
}
}