#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#if GMX_GPU_CUDA
# include "pme.cuh"
#endif
#if GMX_GPU_CUDA
- pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
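+ // Pipeline parameters below are defaults; pme_gpu_spread() sets the per-stage atom range
+ // when GPU direct communication with multiple PP ranks allows pipelined spreading.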
+ pmeGpu->kernelParams->usePipeline = char(false);
+ pmeGpu->kernelParams->pipelineAtomStart = 0;
+ pmeGpu->kernelParams->pipelineAtomEnd = 0;
+ pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
#else
    // Use this path for any non-CUDA GPU acceleration
    // TODO: is there no really global work size limit in OpenCL?
    pmeGpu->maxGridWidthX = INT32_MAX / 2;
#endif

#if GMX_GPU_CUDA
        const gmx::FftBackend backend = gmx::FftBackend::Cufft;
#elif GMX_GPU_OPENCL
const gmx::FftBackend backend = gmx::FftBackend::Ocl;
#elif GMX_GPU_SYCL
+# if GMX_SYCL_DPCPP && GMX_FFT_MKL
+ const gmx::FftBackend backend = gmx::FftBackend::SyclMkl;
+# elif GMX_SYCL_HIPSYCL
+ const gmx::FftBackend backend = gmx::FftBackend::SyclRocfft;
+# else
const gmx::FftBackend backend = gmx::FftBackend::Sycl;
+# endif
#else
GMX_RELEASE_ASSERT(false, "Unknown GPU backend");
        const gmx::FftBackend backend = gmx::FftBackend::Count;
#endif
return kernelPtr;
}
-void pme_gpu_spread(const PmeGpu* pmeGpu,
- GpuEventSynchronizer* xReadyOnDevice,
- real** h_grids,
- bool computeSplines,
- bool spreadCharges,
- const real lambda)
+void pme_gpu_spread(const PmeGpu* pmeGpu,
+ GpuEventSynchronizer* xReadyOnDevice,
+ real** h_grids,
+ bool computeSplines,
+ bool spreadCharges,
+ const real lambda,
+ const bool useGpuDirectComm,
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
{
    GMX_ASSERT(pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
               "Only one (normal Coulomb PME) or two (FEP Coulomb PME) PME grids can be used on GPU");
// Ensure that coordinates are ready on the device before launching spread;
- // only needed with CUDA on PP+PME ranks, not on separate PME ranks, in unit tests
- // nor in OpenCL as these cases use a single stream (hence xReadyOnDevice == nullptr).
- GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice != nullptr || pmeGpu->common->isRankPmeOnly
+ // only needed on PP+PME ranks, not on separate PME ranks nor in unit tests,
+ // as these cases use a single stream (hence xReadyOnDevice == nullptr).
+ GMX_ASSERT(xReadyOnDevice != nullptr || pmeGpu->common->isRankPmeOnly
|| pme_gpu_settings(pmeGpu).copyAllOutputs,
"Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
PmeStage timingId;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
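+ // Splines must be stored in global memory when the host needs the outputs (writeGlobal) or
+ // when the gather kernel reuses them rather than recalculating them.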
+ const bool writeGlobalOrSaveSplines = writeGlobal || (!recalculateSplines);
if (computeSplines)
{
        if (spreadCharges)
        {
            timingId = PmeStage::SplineAndSpread;
kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
+ writeGlobalOrSaveSplines,
pmeGpu->common->ngrids);
}
        else
        {
            timingId = PmeStage::Spline;
kernelPtr = selectSplineKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
+ writeGlobalOrSaveSplines,
pmeGpu->common->ngrids);
}
}
else
{
timingId = PmeStage::Spread;
- kernelPtr = selectSpreadKernelPtr(pmeGpu,
- pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
- pmeGpu->common->ngrids);
+ kernelPtr = selectSpreadKernelPtr(
+ pmeGpu, pmeGpu->settings.threadsPerAtom, writeGlobalOrSaveSplines, pmeGpu->common->ngrids);
}
pme_gpu_start_timing(pmeGpu, timingId);
auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+
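+ // Pipelined spreading overlaps the spread of each PP rank's atoms with the receipt of
+ // coordinates from the remaining ranks. It is used only for the combined spline-and-spread
+ // path, with GPU direct communication from more than one sender rank, and when splines do
+ // not need to be written to global memory.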
+ kernelParamsPtr->usePipeline = char(computeSplines && spreadCharges && useGpuDirectComm
+ && (pmeCoordinateReceiverGpu->ppCommNumSenderRanks() > 1)
+ && !writeGlobalOrSaveSplines);
+ if (kernelParamsPtr->usePipeline != 0)
+ {
+ int numStagesInPipeline = pmeCoordinateReceiverGpu->ppCommNumSenderRanks();
+
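+ // One pipeline stage per sending PP rank: wait for that rank's coordinates, then spread
+ // its atom range in the stream of the corresponding PP-PME communication.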
+ for (int i = 0; i < numStagesInPipeline; i++)
+ {
+ int senderRank;
+ if (useGpuDirectComm)
+ {
+ senderRank = pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRank(
+ i, *(pmeCoordinateReceiverGpu->ppCommStream(i)));
+ }
+ else
+ {
+ senderRank = i;
+ }
+
+ // set kernel configuration options specific to this stage of the pipeline
+ std::tie(kernelParamsPtr->pipelineAtomStart, kernelParamsPtr->pipelineAtomEnd) =
+ pmeCoordinateReceiverGpu->ppCommAtomRange(senderRank);
+ const int blockCount = static_cast<int>(std::ceil(
+ static_cast<float>(kernelParamsPtr->pipelineAtomEnd - kernelParamsPtr->pipelineAtomStart)
+ / atomsPerBlock));
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
+ config.gridSize[0] = dimGrid.first;
+ config.gridSize[1] = dimGrid.second;
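+ // Launch in the per-rank communication stream so that this stage's spread can proceed as
+ // soon as the coordinates from senderRank have arrived.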
+ DeviceStream* launchStream = pmeCoordinateReceiverGpu->ppCommStream(senderRank);
+
+
#if c_canEmbedBuffers
- const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
- const auto kernelArgs =
- prepareGpuKernelArguments(kernelPtr,
- config,
- kernelParamsPtr,
- &kernelParamsPtr->atoms.d_theta,
- &kernelParamsPtr->atoms.d_dtheta,
- &kernelParamsPtr->atoms.d_gridlineIndices,
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
- &kernelParamsPtr->grid.d_fractShiftsTable,
- &kernelParamsPtr->grid.d_gridlineIndicesTable,
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
- &kernelParamsPtr->atoms.d_coordinates);
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
#endif
- launchGpuKernel(
- kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME spline/spread", kernelArgs);
+ launchGpuKernel(kernelPtr, config, *launchStream, timingEvent, "PME spline/spread", kernelArgs);
+ }
+ // Set dependencies for PME stream on all pipeline streams
+ for (int i = 0; i < pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); i++)
+ {
+ GpuEventSynchronizer event;
+ event.markEvent(*(pmeCoordinateReceiverGpu->ppCommStream(i)));
+ event.enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
+ }
+ }
+ else // pipelining is not in use
+ {
+ if (useGpuDirectComm) // Sync all PME-PP communications to PME stream
+ {
+ pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromAllPpRanks(pmeGpu->archSpecific->pmeStream_);
+ }
+
+#if c_canEmbedBuffers
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+#else
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
+#endif
+
+ launchGpuKernel(kernelPtr,
+ config,
+ pmeGpu->archSpecific->pmeStream_,
+ timingEvent,
+ "PME spline/spread",
+ kernelArgs);
+ }
+
pme_gpu_stop_timing(pmeGpu, timingId);
const auto& settings = pmeGpu->settings;