From: Szilárd Páll
Date: Thu, 10 Oct 2019 16:10:38 +0000 (+0200)
Subject: Link GPU coordinate producer and consumer tasks
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=13f5fac20132d264da019c27d32b1c2502da669f;p=alexxy%2Fgromacs.git

Link GPU coordinate producer and consumer tasks

The event synchronizer indicating that coordinates are ready on the
GPU is now passed to the two tasks that depend on this input: PME and
X buffer ops. Both enqueue a wait on the passed event prior to kernel
launch, ensuring that the coordinates are ready before the kernels
start executing. On separate PME ranks and in tests, where a single
stream is used, no synchronization is necessary.

With the on-device sync in place, this change also removes the
streamSynchronize call from copyCoordinatesToGpu.

Refs. #2816, #3126.

Change-Id: I3457f01f44ca6d6ad08e0118d8b1def2ab0b381b
---

diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index c6ec4e6fca..1af591c33d 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -368,10 +368,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t *GPU_FUNC_ARG
  * Launches first stage of PME on GPU - spreading kernel.
  *
  * \param[in] pme            The PME data structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in device memory.
  * \param[in] wcycle         The wallclock counter.
  */
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t            *GPU_FUNC_ARGUMENT(pme),
-                                              gmx_wallcycle        *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t            *GPU_FUNC_ARGUMENT(pme),
+                                              GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+                                              gmx_wallcycle        *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;

 /*! \brief
  * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
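For context, the pattern the rest of this diff wires up can be summarized in
plain CUDA. The sketch below is illustrative only and is not GROMACS code: the
stream, buffer, and kernel names are invented for the example. The producer
stream records an event once the coordinate H2D copy is enqueued; each
consumer stream then waits on that event on-device before launching its
kernel, which is what makes the host-blocking stream synchronization
removable.

#include <cuda_runtime.h>

// Toy consumer kernel standing in for PME spread / X buffer ops.
__global__ void consumeCoordinates(const float4 *x, float *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        out[i] = x[i].x + x[i].y + x[i].z;
    }
}

int main()
{
    const int n = 1 << 16;
    float4   *h_x, *d_x;
    float    *d_pmeOut, *d_nbOut;
    cudaMallocHost(&h_x, n * sizeof(float4));
    cudaMalloc(&d_x, n * sizeof(float4));
    cudaMalloc(&d_pmeOut, n * sizeof(float));
    cudaMalloc(&d_nbOut, n * sizeof(float));
    for (int i = 0; i < n; i++)
    {
        h_x[i] = make_float4(float(i), float(i), float(i), 0.0f);
    }

    cudaStream_t copyStream, pmeStream, nbStream;
    cudaStreamCreate(&copyStream);
    cudaStreamCreate(&pmeStream);
    cudaStreamCreate(&nbStream);
    cudaEvent_t xReadyOnDevice;
    // Timing is disabled: the event is used purely for ordering.
    cudaEventCreateWithFlags(&xReadyOnDevice, cudaEventDisableTiming);

    // Producer: asynchronous H2D copy, then record the event in the same stream.
    cudaMemcpyAsync(d_x, h_x, n * sizeof(float4), cudaMemcpyHostToDevice, copyStream);
    cudaEventRecord(xReadyOnDevice, copyStream);

    // Consumers: each stream waits on the event on-device; the host never blocks.
    cudaStreamWaitEvent(pmeStream, xReadyOnDevice, 0);
    consumeCoordinates<<<(n + 127) / 128, 128, 0, pmeStream>>>(d_x, d_pmeOut, n);
    cudaStreamWaitEvent(nbStream, xReadyOnDevice, 0);
    consumeCoordinates<<<(n + 127) / 128, 128, 0, nbStream>>>(d_x, d_nbOut, n);

    cudaDeviceSynchronize(); // error checking omitted for brevity
    cudaEventDestroy(xReadyOnDevice);
    cudaStreamDestroy(copyStream);
    cudaStreamDestroy(pmeStream);
    cudaStreamDestroy(nbStream);
    cudaFree(d_x);
    cudaFree(d_pmeOut);
    cudaFree(d_nbOut);
    cudaFreeHost(h_x);
    return 0;
}

Note that when producer and consumer share one stream (the separate-PME-rank
and test cases below), stream ordering already guarantees the dependency and
no event is needed.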
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp
index 4685913dbd..8bbd184ac8 100644
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -175,6 +175,7 @@ void pme_gpu_prepare_computation(gmx_pme_t *pme,
 }

 void pme_gpu_launch_spread(gmx_pme_t            *pme,
+                           GpuEventSynchronizer *xReadyOnDevice,
                            gmx_wallcycle        *wcycle)
 {
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
@@ -190,7 +191,7 @@ void pme_gpu_launch_spread(gmx_pme_t *pme,
     const bool spreadCharges = true;
     wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
     wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
-    pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges);
+    pme_gpu_spread(pmeGpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
     wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 }

diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index 7d71782183..e98ae3a223 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -1014,11 +1014,12 @@ std::pair<int, int> inline pmeGpuCreateGrid(const PmeGpu *pmeGpu, int blockCount
     return std::pair<int, int>(colCount, minRowCount);
 }

-void pme_gpu_spread(const PmeGpu         *pmeGpu,
-                    int gmx_unused        gridIndex,
-                    real                 *h_grid,
-                    bool                  computeSplines,
-                    bool                  spreadCharges)
+void pme_gpu_spread(const PmeGpu         *pmeGpu,
+                    GpuEventSynchronizer *xReadyOnDevice,
+                    int gmx_unused        gridIndex,
+                    real                 *h_grid,
+                    bool                  computeSplines,
+                    bool                  spreadCharges)
 {
     GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)");
     const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
@@ -1037,6 +1038,16 @@ void pme_gpu_spread(const PmeGpu *pmeGpu,
     //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
     GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");

+    // Ensure that coordinates are ready on the device before launching spread;
+    // this is only needed with CUDA on PP+PME ranks: separate PME ranks,
+    // unit tests, and OpenCL builds use a single stream and therefore
+    // pass xReadyOnDevice == nullptr.
+    // Note: consider adding an assertion on xReadyOnDevice once separate PME ranks can be detected here.
+    if (xReadyOnDevice)
+    {
+        xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
+    }
+
     const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);

diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h
index bdd92bb83b..0f66915b0d 100644
--- a/src/gromacs/ewald/pme_gpu_internal.h
+++ b/src/gromacs/ewald/pme_gpu_internal.h
@@ -383,17 +383,20 @@ void pme_gpu_get_timings(const PmeGpu *pmeGpu,
  * A GPU spline computation and charge spreading function.
  *
  * \param[in]  pmeGpu          The PME GPU structure.
+ * \param[in]  xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in device memory;
+ *                             can be nullptr when invoked on a separate PME rank or from PME tests.
  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
  * \param[out] h_grid          The host-side grid buffer (used only if the result of the spread is expected on the host,
  *                             e.g. testing or host-side FFT)
  * \param[in]  computeSplines  Should the computation of spline parameters and gridline indices be performed.
 * \param[in]  spreadCharges   Should the charges/coefficients be spread on the grid.
 */
-GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu         *GPU_FUNC_ARGUMENT(pmeGpu),
-                                       int                   GPU_FUNC_ARGUMENT(gridIndex),
-                                       real                 *GPU_FUNC_ARGUMENT(h_grid),
-                                       bool                  GPU_FUNC_ARGUMENT(computeSplines),
-                                       bool                  GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu         *GPU_FUNC_ARGUMENT(pmeGpu),
+                                       GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+                                       int                   GPU_FUNC_ARGUMENT(gridIndex),
+                                       real                 *GPU_FUNC_ARGUMENT(h_grid),
+                                       bool                  GPU_FUNC_ARGUMENT(computeSplines),
+                                       bool                  GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;

 /*! \libinternal \brief
  * 3D FFT R2C/C2R routine.

diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 10ceb2ce3b..8e5ac1de84 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -644,8 +644,10 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
                 // or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested!
                 pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags, useGpuPmeForceReduction);
                 stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x), gmx::StatePropagatorDataGpu::AtomLocality::All);
+                // On a separate PME rank no synchronizer is needed, as everything is scheduled in a single stream.
+                auto xReadyOnDevice = nullptr;

-                pme_gpu_launch_spread(pme, wcycle);
+                pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle);
                 pme_gpu_launch_complex_transforms(pme, wcycle);
                 pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
                 output = pme_gpu_wait_finish_task(pme, pmeFlags, wcycle);

diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp
index 2153e2c139..1311ab30c4 100644
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -311,8 +311,12 @@ void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qual
             break;

         case CodePath::GPU:
-            pme_gpu_spread(pme->gpu, gridIndex, fftgrid, computeSplines, spreadCharges);
-            break;
+        {
+            // no synchronization needed, as x is transferred in the PME stream
+            GpuEventSynchronizer *xReadyOnDevice = nullptr;
+            pme_gpu_spread(pme->gpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
+        }
+        break;

         default:
             GMX_THROW(InternalError("Test not implemented for this mode"));

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 4c8323444d..8aa2d195f9 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -613,16 +613,18 @@ static int makePmeFlags(const StepWorkload &stepWork)
 * \param[in]  box             The box matrix
 * \param[in]  stepWork        Step schedule flags
 * \param[in]  pmeFlags        PME flags
+ * \param[in]  xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in device memory.
 * \param[in]  wcycle          The wallcycle structure
 */
-static inline void launchPmeGpuSpread(gmx_pme_t            *pmedata,
-                                      const matrix          box,
-                                      const StepWorkload   &stepWork,
-                                      int                   pmeFlags,
-                                      gmx_wallcycle_t       wcycle)
+static inline void launchPmeGpuSpread(gmx_pme_t            *pmedata,
+                                      const matrix          box,
+                                      const StepWorkload   &stepWork,
+                                      int                   pmeFlags,
+                                      GpuEventSynchronizer *xReadyOnDevice,
+                                      gmx_wallcycle_t       wcycle)
 {
     pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, stepWork.useGpuPmeFReduction);
-    pme_gpu_launch_spread(pmedata, wcycle);
+    pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle);
 }

 /*! \brief Launch the FFT and gather stages of PME GPU
@@ -1028,9 +1030,12 @@ void do_force(FILE *fplog,
         stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
     }

+    const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
+                                                                                                        simulationWork, stepWork) : nullptr;
     if (useGpuPmeOnThisRank)
     {
-        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, wcycle);
+        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags,
+                           localXReadyOnDevice, wcycle);
     }

     /* do gridding for pair search */
@@ -1148,7 +1153,8 @@ void do_force(FILE *fplog,
             if (useGpuXBufOps == BufferOpsUseGpu::True)
             {
                 nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
-                                           stateGpu->getCoordinates());
+                                           stateGpu->getCoordinates(),
+                                           localXReadyOnDevice);
             }
             else
             {
@@ -1259,7 +1265,9 @@ void do_force(FILE *fplog,
                 stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
             }
             nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::NonLocal, false,
-                                       stateGpu->getCoordinates());
+                                       stateGpu->getCoordinates(),
+                                       stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal,
+                                                                                  simulationWork, stepWork));
         }
         else
         {

diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index 70c73a1657..fd591bc984 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -72,7 +72,6 @@ StatePropagatorDataGpu::Impl::Impl(const void *pmeStream,
     paddingSize_(paddingSize)
 {
     static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
-    GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");

     // TODO: Refactor when the StreamManager is introduced.
     if (GMX_GPU == GMX_GPU_OPENCL)
@@ -136,7 +135,6 @@ StatePropagatorDataGpu::Impl::Impl(const void *pmeStream,
     paddingSize_(paddingSize)
 {
     static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
-    GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");

     if (GMX_GPU == GMX_GPU_OPENCL)
     {
@@ -303,8 +301,6 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef

diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet      &gridSet,
-                                    const Nbnxm::AtomLocality  locality,
-                                    bool                       fillLocal,
-                                    gmx_nbnxn_gpu_t           *gpu_nbv,
-                                    DeviceBuffer<float>        d_x)
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet      &gridSet,
+                                    const Nbnxm::AtomLocality  locality,
+                                    bool                       fillLocal,
+                                    gmx_nbnxn_gpu_t           *gpu_nbv,
+                                    DeviceBuffer<float>        d_x,
+                                    GpuEventSynchronizer      *xReadyOnDevice)
 {
     int gridBegin = 0;
@@ -1102,6 +1103,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
                               fillLocal && g == 0,
                               gpu_nbv,
                               d_x,
+                              xReadyOnDevice,
                               locality,
                               g,
                               gridSet.numColumnsMax());

diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index 35d711beff..bb3984d4d3 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -334,12 +334,14 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
 * \param[in]     fillLocal       Tells if the local filler particle coordinates should be zeroed.
 * \param[in,out] gpu_nbv         The NBNXM GPU data structure.
 * \param[in]     d_x             Coordinates to be copied (in plain rvec format).
+ * \param[in]     xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in device memory.
 */
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
-                                    Nbnxm::AtomLocality   locality,
-                                    bool                  fillLocal,
-                                    gmx_nbnxn_gpu_t      *gpu_nbv,
-                                    DeviceBuffer<float>   d_x);
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
+                                    Nbnxm::AtomLocality   locality,
+                                    bool                  fillLocal,
+                                    gmx_nbnxn_gpu_t      *gpu_nbv,
+                                    DeviceBuffer<float>   d_x,
+                                    GpuEventSynchronizer *xReadyOnDevice);

/*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
 *

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index e6cd8253b7..7b1241e5e2 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -748,6 +748,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
                            bool                       setFillerCoords,
                            gmx_nbnxn_gpu_t           *nb,
                            DeviceBuffer<float>        d_x,
+                           GpuEventSynchronizer      *xReadyOnDevice,
                            const Nbnxm::AtomLocality  locality,
                            int                        gridId,
                            int                        numColumnsMax)
@@ -770,6 +771,10 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
         // TODO: This will only work with CUDA
         GMX_ASSERT(d_x, "Need a valid device pointer");

+        // ensure that coordinates are ready on the device before launching the kernel
+        GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
+        xReadyOnDevice->enqueueWaitEvent(stream);
+
         KernelLaunchConfig config;
         config.blockSize[0] = c_bufOpsThreadsPerBlock;
         config.blockSize[1] = 1;

diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 4fb8cbacda..3a796b504e 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -147,16 +147,18 @@ void nonbonded_verlet_t::convertCoordinates(const Nbnxm::AtomLocality loca
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
 }

-void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality  locality,
-                                               const bool                 fillLocal,
-                                               DeviceBuffer<float>        d_x)
+void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality  locality,
+                                               const bool                 fillLocal,
+                                               DeviceBuffer<float>        d_x,
+                                               GpuEventSynchronizer      *xReadyOnDevice)
 {
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);

     nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal,
                                    gpu_nbv,
-                                   d_x);
+                                   d_x,
+                                   xReadyOnDevice);

     wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);

diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index a5ef5baff3..4378663b93 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -280,13 +280,15 @@ struct nonbonded_verlet_t
      *
      * The API function for the transformation of the coordinates from one layout to another in the GPU memory.
      *
-     * \param[in] locality        Whether coordinates for local or non-local atoms should be transformed.
-     * \param[in] fillLocal       If the coordinates for filler particles should be zeroed.
-     * \param[in] d_x             GPU coordinates buffer in plain rvec format to be transformed.
+     * \param[in] locality        Whether coordinates for local or non-local atoms should be transformed.
+     * \param[in] fillLocal       If the coordinates for filler particles should be zeroed.
+     * \param[in] d_x             GPU coordinates buffer in plain rvec format to be transformed.
+     * \param[in] xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in device memory.
      */
-    void convertCoordinatesGpu(Nbnxm::AtomLocality   locality,
-                               bool                  fillLocal,
-                               DeviceBuffer<float>   d_x);
+    void convertCoordinatesGpu(Nbnxm::AtomLocality   locality,
+                               bool                  fillLocal,
+                               DeviceBuffer<float>   d_x,
+                               GpuEventSynchronizer *xReadyOnDevice);

    //! Init for GPU version of setup coordinates in Nbnxm
    void atomdata_init_copy_x_to_nbat_x_gpu();

diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index afdafb80ff..1061292b2d 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -235,18 +235,20 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
 * \param[in]     setFillerCoords  If the filler coordinates are used.
 * \param[in,out] gpu_nbv          The nonbonded data GPU structure.
 * \param[in]     d_x              Device-side coordinates in plain rvec format.
+ * \param[in]     xReadyOnDevice   Event synchronizer indicating that the coordinates are ready in device memory.
 * \param[in]     locality         Copy coordinates for local or non-local atoms.
 * \param[in]     gridId           Index of the grid being converted.
 * \param[in]     numColumnsMax    Maximum number of columns in the grid.
 */
CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused    &grid,
-                           bool gmx_unused                  setFillerCoords,
-                           gmx_nbnxn_gpu_t gmx_unused      *gpu_nbv,
-                           DeviceBuffer<float> gmx_unused   d_x,
-                           Nbnxm::AtomLocality gmx_unused   locality,
-                           int gmx_unused                   gridId,
-                           int gmx_unused                   numColumnsMax) CUDA_FUNC_TERM;
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused    &grid,
+                           bool gmx_unused                  setFillerCoords,
+                           gmx_nbnxn_gpu_t gmx_unused      *gpu_nbv,
+                           DeviceBuffer<float> gmx_unused   d_x,
+                           GpuEventSynchronizer gmx_unused *xReadyOnDevice,
+                           Nbnxm::AtomLocality gmx_unused   locality,
+                           int gmx_unused                   gridId,
+                           int gmx_unused                   numColumnsMax) CUDA_FUNC_TERM;

/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
 * \param[in] nb The nonbonded data GPU structure
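For reference, the enqueueWaitEvent() calls used throughout this change map
onto the CUDA event API roughly as in the sketch below. This is a simplified
stand-in for GROMACS's GpuEventSynchronizer, not the actual class: only
enqueueWaitEvent() appears in the diff above, and the markEvent() producer
method is assumed here for symmetry.

#include <cuda_runtime.h>

// Simplified GpuEventSynchronizer-like wrapper (illustrative only).
class GpuEventSynchronizerSketch
{
    public:
        GpuEventSynchronizerSketch()
        {
            // Timing is disabled: the event exists purely to order streams.
            cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
        }
        ~GpuEventSynchronizerSketch()
        {
            cudaEventDestroy(event_);
        }
        // Producer side: record the event in the stream that produces the
        // data, e.g. the stream performing the coordinate H2D copy.
        void markEvent(cudaStream_t stream)
        {
            cudaEventRecord(event_, stream);
        }
        // Consumer side: make the given stream wait on the event on-device;
        // the host thread does not block.
        void enqueueWaitEvent(cudaStream_t stream)
        {
            cudaStreamWaitEvent(stream, event_, 0);
        }
    private:
        cudaEvent_t event_;
};

With this contract, correctness no longer depends on a host-blocking
streamSynchronize in copyCoordinatesToGpu: the ordering moves into the
device-side dependency between the producer and consumer streams.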