* Launches the first stage of PME on the GPU - the spreading kernel.
*
* \param[in] pme The PME data structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
* \param[in] wcycle The wallclock counter.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
- gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
/*! \brief
* Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
}
void pme_gpu_launch_spread(gmx_pme_t *pme,
+ GpuEventSynchronizer *xReadyOnDevice,
gmx_wallcycle *wcycle)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
const bool spreadCharges = true;
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
- pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges);
+ pme_gpu_spread(pmeGpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
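The synchronization added here follows GpuEventSynchronizer's mark-then-wait idiom: the stream that produces the coordinates marks the event, and each consuming stream enqueues an asynchronous wait on it before launching dependent kernels. A minimal sketch of the idiom; the stream and helper names (copyStream, pmeStream, copyCoordinatesToDevice, launchSpreadKernel) are illustrative placeholders, not GROMACS API:

    GpuEventSynchronizer xReadyOnDevice;
    // Producer: mark the event in the H2D copy stream once the copy is enqueued.
    copyCoordinatesToDevice(h_x, d_x, copyStream);  // illustrative helper
    xReadyOnDevice.markEvent(copyStream);
    // Consumer: make the PME stream wait for the copy without blocking the host.
    xReadyOnDevice.enqueueWaitEvent(pmeStream);
    launchSpreadKernel(pmeStream);                  // illustrative helper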
return std::pair<int, int>(colCount, minRowCount);
}
-void pme_gpu_spread(const PmeGpu *pmeGpu,
- int gmx_unused gridIndex,
- real *h_grid,
- bool computeSplines,
- bool spreadCharges)
+void pme_gpu_spread(const PmeGpu *pmeGpu,
+ GpuEventSynchronizer *xReadyOnDevice,
+ int gmx_unused gridIndex,
+ real *h_grid,
+ bool computeSplines,
+ bool spreadCharges)
{
GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)");
const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
// (for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
+ // Ensure that the coordinates are ready on the device before launching spread;
+ // this is only needed with CUDA on PP+PME ranks: separate PME ranks, unit tests,
+ // and OpenCL all use a single stream (hence xReadyOnDevice == nullptr there).
+ // Note: consider adding an assertion on xReadyOnDevice once separate PME ranks
+ // can be detected here.
+ if (xReadyOnDevice)
+ {
+ xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
+ }
+
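+ // With padding enabled, nAtomsPadded is a multiple of atomsPerBlock (see the assertion above), so this division is exact.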
const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
* A GPU spline computation and charge spreading function.
*
* \param[in] pmeGpu The PME GPU structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory;
+ * can be nullptr when invoked on a separate PME rank or from PME tests.
* \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
* \param[out] h_grid The host-side grid buffer (used only if the result of the spread is expected on the host,
* e.g. testing or host-side FFT)
* \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed.
* \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
- int GPU_FUNC_ARGUMENT(gridIndex),
- real *GPU_FUNC_ARGUMENT(h_grid),
- bool GPU_FUNC_ARGUMENT(computeSplines),
- bool GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+ int GPU_FUNC_ARGUMENT(gridIndex),
+ real *GPU_FUNC_ARGUMENT(h_grid),
+ bool GPU_FUNC_ARGUMENT(computeSplines),
+ bool GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
/*! \libinternal \brief
* 3D FFT R2C/C2R routine.
// or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested!
pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags, useGpuPmeForceReduction);
stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x), gmx::StatePropagatorDataGpu::AtomLocality::All);
+ // On the separate PME rank we do not need a synchronizer, as everything is scheduled in a single stream
+ GpuEventSynchronizer *xReadyOnDevice = nullptr;
- pme_gpu_launch_spread(pme, wcycle);
+ pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle);
pme_gpu_launch_complex_transforms(pme, wcycle);
pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
output = pme_gpu_wait_finish_task(pme, pmeFlags, wcycle);
break;
case CodePath::GPU:
- pme_gpu_spread(pme->gpu, gridIndex, fftgrid, computeSplines, spreadCharges);
- break;
+ {
+ // No synchronization needed, as x is transferred in the PME stream
+ GpuEventSynchronizer *xReadyOnDevice = nullptr;
+ pme_gpu_spread(pme->gpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
+ }
+ break;
default:
GMX_THROW(InternalError("Test not implemented for this mode"));
* \param[in] box The box matrix
* \param[in] stepWork Step schedule flags
* \param[in] pmeFlags PME flags
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
* \param[in] wcycle The wallcycle structure
*/
-static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
- const matrix box,
- const StepWorkload &stepWork,
- int pmeFlags,
- gmx_wallcycle_t wcycle)
+static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
+ const matrix box,
+ const StepWorkload &stepWork,
+ int pmeFlags,
+ GpuEventSynchronizer *xReadyOnDevice,
+ gmx_wallcycle_t wcycle)
{
pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, stepWork.useGpuPmeFReduction);
- pme_gpu_launch_spread(pmedata, wcycle);
+ pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle);
}
/*! \brief Launch the FFT and gather stages of PME GPU
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
}
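+ // Event marking that the local coordinates are available on the device; nullptr when no GPU state propagator is in use.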
+ const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
+ simulationWork, stepWork) : nullptr;
if (useGpuPmeOnThisRank)
{
- launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, wcycle);
+ launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags,
+ localXReadyOnDevice, wcycle);
}
/* do gridding for pair search */
if (useGpuXBufOps == BufferOpsUseGpu::True)
{
nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
- stateGpu->getCoordinates());
+ stateGpu->getCoordinates(),
+ localXReadyOnDevice);
}
else
{
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
}
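+ // The non-local conversion kernel must likewise wait until the non-local coordinates have arrived on the device.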
nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::NonLocal, false,
- stateGpu->getCoordinates());
+ stateGpu->getCoordinates(),
+ stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal,
+ simulationWork, stepWork));
}
else
{
paddingSize_(paddingSize)
{
static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
- GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
// TODO: Refactor when the StreamManager is introduced.
if (GMX_GPU == GMX_GPU_OPENCL)
paddingSize_(paddingSize)
{
static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
- GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
if (GMX_GPU == GMX_GPU_OPENCL)
{
if (GMX_GPU == GMX_GPU_CUDA)
{
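+ // Record the copy completion in the stream; dependent tasks now enqueue a wait on this event,
+ // which makes the blocking stream synchronization below unnecessary.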
xReadyOnDevice_[atomLocality].markEvent(commandStream);
- // TODO: Remove When event-based synchronization is introduced
- gpuStreamSynchronize(commandStream);
}
}
//
// TODO: This should be reconsidered to support the halo exchange.
//
+ // In OpenCL no events are used, as coordinate synchronization is not necessary
+ if (GMX_GPU == GMX_GPU_OPENCL)
+ {
+ return nullptr;
+ }
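+ // When the coordinates are produced by the GPU update (no H2D copy on this step), hand out the update-completion event instead.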
if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
{
return &xUpdatedOnDevice_;
}
/* Copies (and reorders) the coordinates to nbnxn_atomdata_t on the GPU*/
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
- const Nbnxm::AtomLocality locality,
- bool fillLocal,
- gmx_nbnxn_gpu_t *gpu_nbv,
- DeviceBuffer<float> d_x)
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
+ const Nbnxm::AtomLocality locality,
+ bool fillLocal,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice)
{
int gridBegin = 0;
fillLocal && g == 0,
gpu_nbv,
d_x,
+ xReadyOnDevice,
locality,
g,
gridSet.numColumnsMax());
* \param[in] fillLocal Tells if the local filler particle coordinates should be zeroed.
* \param[in,out] gpu_nbv The NBNXM GPU data structure.
* \param[in] d_x Coordinates to be copied (in plain rvec format).
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
*/
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
- Nbnxm::AtomLocality locality,
- bool fillLocal,
- gmx_nbnxn_gpu_t *gpu_nbv,
- DeviceBuffer<float> d_x);
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
+ Nbnxm::AtomLocality locality,
+ bool fillLocal,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice);
/*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
*
bool setFillerCoords,
gmx_nbnxn_gpu_t *nb,
DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice,
const Nbnxm::AtomLocality locality,
int gridId,
int numColumnsMax)
// TODO: This will only work with CUDA
GMX_ASSERT(d_x, "Need a valid device pointer");
+ // Ensure that the coordinates are ready on the device before launching the kernel
+ GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
+ xReadyOnDevice->enqueueWaitEvent(stream);
+
KernelLaunchConfig config;
config.blockSize[0] = c_bufOpsThreadsPerBlock;
config.blockSize[1] = 1;
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
}
-void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality locality,
- const bool fillLocal,
- DeviceBuffer<float> d_x)
+void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality locality,
+ const bool fillLocal,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice)
{
wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal,
gpu_nbv,
- d_x);
+ d_x,
+ xReadyOnDevice);
wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
*
* The API function for the transformation of the coordinates from one layout to another in the GPU memory.
*
- * \param[in] locality Whether coordinates for local or non-local atoms should be transformed.
- * \param[in] fillLocal If the coordinates for filler particles should be zeroed.
- * \param[in] d_x GPU coordinates buffer in plain rvec format to be transformed.
+ * \param[in] locality Whether coordinates for local or non-local atoms should be transformed.
+ * \param[in] fillLocal If the coordinates for filler particles should be zeroed.
+ * \param[in] d_x GPU coordinates buffer in plain rvec format to be transformed.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
*/
- void convertCoordinatesGpu(Nbnxm::AtomLocality locality,
- bool fillLocal,
- DeviceBuffer<float> d_x);
+ void convertCoordinatesGpu(Nbnxm::AtomLocality locality,
+ bool fillLocal,
+ DeviceBuffer<float> d_x,
+ GpuEventSynchronizer *xReadyOnDevice);
//! Init for GPU version of setup coordinates in Nbnxm
void atomdata_init_copy_x_to_nbat_x_gpu();
* \param[in] setFillerCoords If the filler coordinates are used.
* \param[in,out] gpu_nbv The nonbonded data GPU structure.
* \param[in] d_x Device-side coordinates in plain rvec format.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
* \param[in] locality Copy coordinates for local or non-local atoms.
* \param[in] gridId Index of the grid being converted.
* \param[in] numColumnsMax Maximum number of columns in the grid.
*/
CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid,
- bool gmx_unused setFillerCoords,
- gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
- DeviceBuffer<float> gmx_unused d_x,
- Nbnxm::AtomLocality gmx_unused locality,
- int gmx_unused gridId,
- int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid,
+ bool gmx_unused setFillerCoords,
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+ DeviceBuffer<float> gmx_unused d_x,
+ GpuEventSynchronizer gmx_unused *xReadyOnDevice,
+ Nbnxm::AtomLocality gmx_unused locality,
+ int gmx_unused gridId,
+ int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
* \param[in] nb The nonbonded data GPU structure