diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index 96dc1f4db9..2f7da67a3a 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -75,6 +75,7 @@
 #include "gromacs/utility/logger.h"
 #include "gromacs/utility/stringutil.h"
 #include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
 
 #if GMX_GPU_CUDA
 #    include "pme.cuh"
@@ -593,7 +594,10 @@ static void pme_gpu_init_internal(PmeGpu* pmeGpu, const DeviceContext& deviceCon
      */
 
 #if GMX_GPU_CUDA
-    pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
+    pmeGpu->kernelParams->usePipeline       = false;
+    pmeGpu->kernelParams->pipelineAtomStart = 0;
+    pmeGpu->kernelParams->pipelineAtomEnd   = 0;
+    pmeGpu->maxGridWidthX                   = deviceContext.deviceInfo().prop.maxGridSize[0];
 #else
     // Use this path for any non-CUDA GPU acceleration
     // TODO: is there no really global work size limit in OpenCL?
@@ -1276,12 +1280,14 @@ static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu,
     return kernelPtr;
 }
 
-void pme_gpu_spread(const PmeGpu*         pmeGpu,
-                    GpuEventSynchronizer* xReadyOnDevice,
-                    real**                h_grids,
-                    bool                  computeSplines,
-                    bool                  spreadCharges,
-                    const real            lambda)
+void pme_gpu_spread(const PmeGpu*                  pmeGpu,
+                    GpuEventSynchronizer*          xReadyOnDevice,
+                    real**                         h_grids,
+                    bool                           computeSplines,
+                    bool                           spreadCharges,
+                    const real                     lambda,
+                    const bool                     useGpuDirectComm,
+                    gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
 {
     GMX_ASSERT(
             pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
@@ -1350,6 +1356,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu,
     PmeStage timingId;
     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
 
+    const bool writeGlobalOrSaveSplines = writeGlobal || (!recalculateSplines);
     if (computeSplines)
     {
         if (spreadCharges)
@@ -1357,7 +1364,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu,
             timingId = PmeStage::SplineAndSpread;
             kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu,
                                                        pmeGpu->settings.threadsPerAtom,
-                                                       writeGlobal || (!recalculateSplines),
+                                                       writeGlobalOrSaveSplines,
                                                        pmeGpu->common->ngrids);
         }
         else
@@ -1365,43 +1372,116 @@ void pme_gpu_spread(const PmeGpu* pmeGpu,
             timingId = PmeStage::Spline;
             kernelPtr = selectSplineKernelPtr(pmeGpu,
                                               pmeGpu->settings.threadsPerAtom,
-                                              writeGlobal || (!recalculateSplines),
+                                              writeGlobalOrSaveSplines,
                                               pmeGpu->common->ngrids);
         }
     }
     else
     {
         timingId = PmeStage::Spread;
-        kernelPtr = selectSpreadKernelPtr(pmeGpu,
-                                          pmeGpu->settings.threadsPerAtom,
-                                          writeGlobal || (!recalculateSplines),
-                                          pmeGpu->common->ngrids);
+        kernelPtr = selectSpreadKernelPtr(
+                pmeGpu, pmeGpu->settings.threadsPerAtom, writeGlobalOrSaveSplines, pmeGpu->common->ngrids);
     }
 
     pme_gpu_start_timing(pmeGpu, timingId);
     auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+
+    kernelParamsPtr->usePipeline = computeSplines && spreadCharges && useGpuDirectComm
+                                   && (pmeCoordinateReceiverGpu->ppCommNumSenderRanks() > 1)
+                                   && !writeGlobalOrSaveSplines;
+    if (kernelParamsPtr->usePipeline)
+    {
+        int numStagesInPipeline = pmeCoordinateReceiverGpu->ppCommNumSenderRanks();
+
+        for (int i = 0; i < numStagesInPipeline; i++)
+        {
+            int senderRank;
+            if (useGpuDirectComm)
+            {
+                senderRank = pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRank(
+                        i, *(pmeCoordinateReceiverGpu->ppCommStream(i)));
+            }
+            else
+            {
+                senderRank = i;
+            }
+
+            // set kernel configuration options specific to this stage of the pipeline
+            std::tie(kernelParamsPtr->pipelineAtomStart, kernelParamsPtr->pipelineAtomEnd) =
+                    pmeCoordinateReceiverGpu->ppCommAtomRange(senderRank);
+            const int blockCount = static_cast<int>(std::ceil(
+                    static_cast<float>(kernelParamsPtr->pipelineAtomEnd - kernelParamsPtr->pipelineAtomStart)
+                    / atomsPerBlock));
+            auto dimGrid       = pmeGpuCreateGrid(pmeGpu, blockCount);
+            config.gridSize[0] = dimGrid.first;
+            config.gridSize[1] = dimGrid.second;
+            DeviceStream* launchStream = pmeCoordinateReceiverGpu->ppCommStream(senderRank);
+
 #if c_canEmbedBuffers
-        const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+            const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
 #else
-        const auto kernelArgs =
-                prepareGpuKernelArguments(kernelPtr,
-                                          config,
-                                          kernelParamsPtr,
-                                          &kernelParamsPtr->atoms.d_theta,
-                                          &kernelParamsPtr->atoms.d_dtheta,
-                                          &kernelParamsPtr->atoms.d_gridlineIndices,
-                                          &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
-                                          &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
-                                          &kernelParamsPtr->grid.d_fractShiftsTable,
-                                          &kernelParamsPtr->grid.d_gridlineIndicesTable,
-                                          &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
-                                          &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
-                                          &kernelParamsPtr->atoms.d_coordinates);
+            const auto kernelArgs =
+                    prepareGpuKernelArguments(kernelPtr,
+                                              config,
+                                              kernelParamsPtr,
+                                              &kernelParamsPtr->atoms.d_theta,
+                                              &kernelParamsPtr->atoms.d_dtheta,
+                                              &kernelParamsPtr->atoms.d_gridlineIndices,
+                                              &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+                                              &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+                                              &kernelParamsPtr->grid.d_fractShiftsTable,
+                                              &kernelParamsPtr->grid.d_gridlineIndicesTable,
+                                              &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+                                              &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+                                              &kernelParamsPtr->atoms.d_coordinates);
 #endif
 
-    launchGpuKernel(
-            kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME spline/spread", kernelArgs);
+            launchGpuKernel(kernelPtr, config, *launchStream, timingEvent, "PME spline/spread", kernelArgs);
+        }
+        // Set dependencies for PME stream on all pipeline streams
+        for (int i = 0; i < pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); i++)
+        {
+            GpuEventSynchronizer event;
+            event.markEvent(*(pmeCoordinateReceiverGpu->ppCommStream(i)));
+            event.enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
+        }
+    }
+    else // pipelining is not in use
+    {
+        if (useGpuDirectComm) // Sync all PME-PP communications to PME stream
+        {
+            pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromAllPpRanks(pmeGpu->archSpecific->pmeStream_);
+        }
+
+#if c_canEmbedBuffers
+        const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+#else
+        const auto kernelArgs =
+                prepareGpuKernelArguments(kernelPtr,
+                                          config,
+                                          kernelParamsPtr,
+                                          &kernelParamsPtr->atoms.d_theta,
+                                          &kernelParamsPtr->atoms.d_dtheta,
+                                          &kernelParamsPtr->atoms.d_gridlineIndices,
+                                          &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+                                          &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+                                          &kernelParamsPtr->grid.d_fractShiftsTable,
+                                          &kernelParamsPtr->grid.d_gridlineIndicesTable,
+                                          &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+                                          &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+                                          &kernelParamsPtr->atoms.d_coordinates);
+#endif
+
+        launchGpuKernel(kernelPtr,
+                        config,
+                        pmeGpu->archSpecific->pmeStream_,
+                        timingEvent,
+                        "PME spline/spread",
+                        kernelArgs);
+    }
+
     pme_gpu_stop_timing(pmeGpu, timingId);
 
     const auto& settings = pmeGpu->settings;
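
--
Note on the pipelined path above (commentary, not part of the patch): when
kernelParamsPtr->usePipeline is set, the spline/spread launch is fanned out over
one stream per PP sender rank, and the main PME stream is then made to wait on
an event recorded in each of those streams before any later PME work is
enqueued. The standalone CUDA sketch below shows only that stream/event
pattern. Everything in it is a hypothetical placeholder (stageKernel, the stage
count, the sizes), and cudaEventRecord()/cudaStreamWaitEvent() merely stand in
for what GpuEventSynchronizer::markEvent()/enqueueWaitEvent() do in the patch.

// pipeline_events_sketch.cu -- illustrative only
#include <cuda_runtime.h>
#include <vector>

// Hypothetical stand-in for one pipeline stage of spread work, covering the
// atom range [atomStart, atomEnd) contributed by one PP sender rank.
__global__ void stageKernel(float* data, int atomStart, int atomEnd)
{
    const int i = atomStart + blockIdx.x * blockDim.x + threadIdx.x;
    if (i < atomEnd)
    {
        data[i] += 1.0F;
    }
}

int main()
{
    const int numStages     = 4;   // plays the role of ppCommNumSenderRanks()
    const int atomsPerStage = 256; // plays the role of one ppCommAtomRange() extent

    float* d_data = nullptr;
    cudaMalloc(&d_data, numStages * atomsPerStage * sizeof(float));

    cudaStream_t mainStream; // plays the role of pmeGpu->archSpecific->pmeStream_
    cudaStreamCreate(&mainStream);

    std::vector<cudaStream_t> stageStreams(numStages); // per-sender ppCommStream(i)
    for (auto& s : stageStreams)
    {
        cudaStreamCreate(&s);
    }

    // Launch each stage on its own stream, so one sender's atoms can be
    // processed as soon as its coordinates arrive, independently of the rest.
    for (int i = 0; i < numStages; i++)
    {
        const int atomStart = i * atomsPerStage;
        const int atomEnd   = atomStart + atomsPerStage;
        stageKernel<<<(atomsPerStage + 127) / 128, 128, 0, stageStreams[i]>>>(
                d_data, atomStart, atomEnd);
    }

    // Make the main stream depend on all stage streams: record an event in
    // each stage stream and enqueue a wait for it on the main stream, the same
    // ordering the patch's "Set dependencies" loop establishes.
    for (int i = 0; i < numStages; i++)
    {
        cudaEvent_t event;
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
        cudaEventRecord(event, stageStreams[i]);
        cudaStreamWaitEvent(mainStream, event, 0);
        cudaEventDestroy(event); // safe: CUDA defers destruction until the event completes
    }

    // Work enqueued on mainStream from here on runs after every stage kernel.
    cudaStreamSynchronize(mainStream);

    for (auto& s : stageStreams)
    {
        cudaStreamDestroy(s);
    }
    cudaStreamDestroy(mainStream);
    cudaFree(d_data);
    return 0;
}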