#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#if GMX_GPU_CUDA
# include "pme.cuh"
*/
#if GMX_GPU_CUDA
- pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
+ pmeGpu->kernelParams->usePipeline = false;
+ pmeGpu->kernelParams->pipelineAtomStart = 0;
+ pmeGpu->kernelParams->pipelineAtomEnd = 0;
+ pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
#else
// Use this path for any non-CUDA GPU acceleration
// TODO: is there really no global work size limit in OpenCL?
return kernelPtr;
}
-void pme_gpu_spread(const PmeGpu* pmeGpu,
- GpuEventSynchronizer* xReadyOnDevice,
- real** h_grids,
- bool computeSplines,
- bool spreadCharges,
- const real lambda)
+void pme_gpu_spread(const PmeGpu* pmeGpu,
+ GpuEventSynchronizer* xReadyOnDevice,
+ real** h_grids,
+ bool computeSplines,
+ bool spreadCharges,
+ const real lambda,
+ const bool useGpuDirectComm,
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
{
GMX_ASSERT(
pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
PmeStage timingId;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+ const bool writeGlobalOrSaveSplines = writeGlobal || (!recalculateSplines);
if (computeSplines)
{
if (spreadCharges)
timingId = PmeStage::SplineAndSpread;
kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
+ writeGlobalOrSaveSplines,
pmeGpu->common->ngrids);
}
else
timingId = PmeStage::Spline;
kernelPtr = selectSplineKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
+ writeGlobalOrSaveSplines,
pmeGpu->common->ngrids);
}
}
else
{
timingId = PmeStage::Spread;
- kernelPtr = selectSpreadKernelPtr(pmeGpu,
- pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
- pmeGpu->common->ngrids);
+ kernelPtr = selectSpreadKernelPtr(
+ pmeGpu, pmeGpu->settings.threadsPerAtom, writeGlobalOrSaveSplines, pmeGpu->common->ngrids);
}
pme_gpu_start_timing(pmeGpu, timingId);
auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+
+ kernelParamsPtr->usePipeline = computeSplines && spreadCharges && useGpuDirectComm
+ && (pmeCoordinateReceiverGpu->ppCommNumSenderRanks() > 1)
+ && !writeGlobalOrSaveSplines;
+ if (kernelParamsPtr->usePipeline)
+ {
+ int numStagesInPipeline = pmeCoordinateReceiverGpu->ppCommNumSenderRanks();
+
+ for (int i = 0; i < numStagesInPipeline; i++)
+ {
+ int senderRank;
+ if (useGpuDirectComm)
+ {
+ senderRank = pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRank(
+ i, *(pmeCoordinateReceiverGpu->ppCommStream(i)));
+ }
+ else
+ {
+ senderRank = i;
+ }
+
+ // set kernel configuration options specific to this stage of the pipeline
+ std::tie(kernelParamsPtr->pipelineAtomStart, kernelParamsPtr->pipelineAtomEnd) =
+ pmeCoordinateReceiverGpu->ppCommAtomRange(senderRank);
+ const int blockCount = static_cast<int>(std::ceil(
+ static_cast<float>(kernelParamsPtr->pipelineAtomEnd - kernelParamsPtr->pipelineAtomStart)
+ / atomsPerBlock));
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
+ config.gridSize[0] = dimGrid.first;
+ config.gridSize[1] = dimGrid.second;
+ DeviceStream* launchStream = pmeCoordinateReceiverGpu->ppCommStream(senderRank);
+
+
#if c_canEmbedBuffers
- const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
- const auto kernelArgs =
- prepareGpuKernelArguments(kernelPtr,
- config,
- kernelParamsPtr,
- &kernelParamsPtr->atoms.d_theta,
- &kernelParamsPtr->atoms.d_dtheta,
- &kernelParamsPtr->atoms.d_gridlineIndices,
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
- &kernelParamsPtr->grid.d_fractShiftsTable,
- &kernelParamsPtr->grid.d_gridlineIndicesTable,
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
- &kernelParamsPtr->atoms.d_coordinates);
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
#endif
- launchGpuKernel(
- kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME spline/spread", kernelArgs);
+ launchGpuKernel(kernelPtr, config, *launchStream, timingEvent, "PME spline/spread", kernelArgs);
+ }
+ // Set dependencies for PME stream on all pipeline streams
+ for (int i = 0; i < pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); i++)
+ {
+ GpuEventSynchronizer event;
+ event.markEvent(*(pmeCoordinateReceiverGpu->ppCommStream(i)));
+ event.enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
+ }
+ }
+ else // pipelining is not in use
+ {
+ if (useGpuDirectComm) // Sync all PME-PP communications to PME stream
+ {
+ pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromAllPpRanks(pmeGpu->archSpecific->pmeStream_);
+ }
+
+#if c_canEmbedBuffers
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+#else
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
+#endif
+
+ launchGpuKernel(kernelPtr,
+ config,
+ pmeGpu->archSpecific->pmeStream_,
+ timingEvent,
+ "PME spline/spread",
+ kernelArgs);
+ }
+
pme_gpu_stop_timing(pmeGpu, timingId);
const auto& settings = pmeGpu->settings;