From 77857c59611ace933fb049f6e43bc9e7e50b1c1c Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Fri, 27 Sep 2019 22:29:30 +0200 Subject: [PATCH] Pass the GPU streams to StatePropagatorDataGpu constructor Now the StatePropagatorDataGpu has a local copy of all GPU streams and manages the update stream. This will allow to select the specific stream for a specific copy event in the follow-ups. The update stream is now created in the constructor of the StatePropagatorDataGPU object, which is a temporary solution until there is a separate device stream manager (#3115). Notes: - The current implementation where StatePropagatorDataGpu is also used on PME-only ranks, where many of the streams do not exist, without any restriction on the methods which would require these streams is a weakness of the design that will be dealt with in follow-up - The OpenCL builds unconditionally use PME stream/context, since for these this object is only used when the initial coordinates are copied. - The update stream is created in the constructor, whereas the rest of the streams is passed as arguments. This asymmentry will be removed with introduction of the centralized management of context/streams. Refs. #2816. Change-Id: Ia9b1cabd1d3d4942dba8465c716bf644037581e7 --- src/gromacs/ewald/pme_only.cpp | 11 ++- src/gromacs/ewald/tests/pmegathertest.cpp | 7 +- .../ewald/tests/pmesplinespreadtest.cpp | 8 +- src/gromacs/ewald/tests/pmetestcommon.cpp | 10 +-- src/gromacs/ewald/tests/pmetestcommon.h | 2 +- src/gromacs/mdlib/update_constrain_cuda.h | 3 + .../mdlib/update_constrain_cuda_impl.cpp | 5 ++ .../mdlib/update_constrain_cuda_impl.cu | 10 +++ .../mdlib/update_constrain_cuda_impl.h | 4 + src/gromacs/mdrun/md.cpp | 6 +- src/gromacs/mdrun/runner.cpp | 34 ++++--- .../mdtypes/state_propagator_data_gpu.h | 25 ++++-- .../state_propagator_data_gpu_impl.cpp | 13 +-- .../mdtypes/state_propagator_data_gpu_impl.h | 67 +++++++++----- .../state_propagator_data_gpu_impl_gpu.cpp | 89 +++++++++++++------ 15 files changed, 198 insertions(+), 96 deletions(-) diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 01518afbb7..6e202ad32f 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -556,8 +556,15 @@ int gmx_pmeonly(struct gmx_pme_t *pme, changePinningPolicy(&pme_pp->x, pme_get_pinning_policy()); } - // Unconditionally initialize the StatePropagatorDataGpu object to get more verbose message if it is used from CPU builds - auto stateGpu = std::make_unique(commandStream, deviceContext, GpuApiCallBehavior::Sync, paddingSize); + std::unique_ptr stateGpu; + if (useGpuForPme) + { + // TODO: The local and non-local nonbonded streams are passed as nullptrs, since they will be not used for the GPU buffer + // management in PME only ranks. Make the constructor safer. + stateGpu = std::make_unique(commandStream, nullptr, nullptr, + deviceContext, GpuApiCallBehavior::Sync, paddingSize); + } + clear_nrnb(mynrnb); diff --git a/src/gromacs/ewald/tests/pmegathertest.cpp b/src/gromacs/ewald/tests/pmegathertest.cpp index d2afc1e324..077970b4a9 100644 --- a/src/gromacs/ewald/tests/pmegathertest.cpp +++ b/src/gromacs/ewald/tests/pmegathertest.cpp @@ -407,9 +407,10 @@ class PmeGatherTest : public ::testing::TestWithParam (inputForceTreatment == PmeForceOutputHandling::ReduceWithInput) ? "with reduction" : "without reduction" )); - PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(), context->getPmeGpuProgram(), box); - StatePropagatorDataGpu stateGpu = makeStatePropagatorDataGpu(*pmeSafe.get()); - pmeInitAtoms(pmeSafe.get(), &stateGpu, codePath, inputAtomData.coordinates, inputAtomData.charges); + PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(), context->getPmeGpuProgram(), box); + std::unique_ptr stateGpu = (codePath == CodePath::GPU) ? makeStatePropagatorDataGpu(*pmeSafe.get()) : nullptr; + + pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, inputAtomData.coordinates, inputAtomData.charges); /* Setting some more inputs */ pmeSetRealGrid(pmeSafe.get(), codePath, nonZeroGridValues); diff --git a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp index 136c08b697..4a30d4d1c5 100644 --- a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp +++ b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp @@ -122,7 +122,6 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParamgetHardwareContexts()) { - std::shared_ptr stateGpu; CodePath codePath = context->getCodePath(); const bool supportedInput = pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath); if (!supportedInput) @@ -146,9 +145,10 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParamgetDeviceInfo(), context->getPmeGpuProgram(), box); - StatePropagatorDataGpu stateGpu = makeStatePropagatorDataGpu(*pmeSafe.get()); - pmeInitAtoms(pmeSafe.get(), &stateGpu, codePath, coordinates, charges); + PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(), context->getPmeGpuProgram(), box); + std::unique_ptr stateGpu = (codePath == CodePath::GPU) ? makeStatePropagatorDataGpu(*pmeSafe.get()) : nullptr; + + pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, coordinates, charges); const bool computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified); const bool spreadCharges = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified); diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index c3e538eb00..c82c0c685d 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -168,14 +168,14 @@ PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec, } //! Make a GPU state-propagator manager -StatePropagatorDataGpu +std::unique_ptr makeStatePropagatorDataGpu(const gmx_pme_t &pme) { // TODO: Pin the host buffer and use async memory copies - return StatePropagatorDataGpu(pme_gpu_get_device_stream(&pme), - pme_gpu_get_device_context(&pme), - GpuApiCallBehavior::Sync, - pme_gpu_get_padding_size(&pme)); + return std::make_unique(pme_gpu_get_device_stream(&pme), nullptr, nullptr, + pme_gpu_get_device_context(&pme), + GpuApiCallBehavior::Sync, + pme_gpu_get_padding_size(&pme)); } //! PME initialization with atom data diff --git a/src/gromacs/ewald/tests/pmetestcommon.h b/src/gromacs/ewald/tests/pmetestcommon.h index dd27865ab7..0f00f57e38 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.h +++ b/src/gromacs/ewald/tests/pmetestcommon.h @@ -134,7 +134,7 @@ PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec, const Matrix3x3 &box = {{1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F}}, real ewaldCoeff_q = 0.0F, real ewaldCoeff_lj = 0.0F); //! Make a GPU state-propagator manager -StatePropagatorDataGpu +std::unique_ptr makeStatePropagatorDataGpu(const gmx_pme_t &pme); //! PME initialization with atom data and system box void pmeInitAtoms(gmx_pme_t *pme, diff --git a/src/gromacs/mdlib/update_constrain_cuda.h b/src/gromacs/mdlib/update_constrain_cuda.h index 25c9e7af38..f2035c687f 100644 --- a/src/gromacs/mdlib/update_constrain_cuda.h +++ b/src/gromacs/mdlib/update_constrain_cuda.h @@ -128,6 +128,9 @@ class UpdateConstrainCuda */ void setPbc(const t_pbc *pbc); + /*! \brief Synchronize the device stream. + */ + void synchronizeStream(); private: class Impl; diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp index b5823ed97f..f8f9c45a52 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp @@ -93,6 +93,11 @@ void UpdateConstrainCuda::setPbc(gmx_unused const t_pbc *pbc) GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); } +void UpdateConstrainCuda::synchronizeStream() +{ + GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); +} + } // namespace gmx #endif /* GMX_GPU != GMX_GPU_CUDA */ diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cu b/src/gromacs/mdlib/update_constrain_cuda_impl.cu index b22f365358..e466772094 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.cu +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cu @@ -162,6 +162,11 @@ void UpdateConstrainCuda::Impl::setPbc(const t_pbc *pbc) settleCuda_->setPbc(pbc); } +void UpdateConstrainCuda::Impl::synchronizeStream() +{ + gpuStreamSynchronize(commandStream_); +} + UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec &ir, const gmx_mtop_t &mtop, const void *commandStream) @@ -201,4 +206,9 @@ void UpdateConstrainCuda::setPbc(const t_pbc *pbc) impl_->setPbc(pbc); } +void UpdateConstrainCuda::synchronizeStream() +{ + impl_->synchronizeStream(); +} + } //namespace gmx diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.h b/src/gromacs/mdlib/update_constrain_cuda_impl.h index 652dd84eb6..85c6fb0dc4 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.h +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.h @@ -132,6 +132,10 @@ class UpdateConstrainCuda::Impl */ void setPbc(const t_pbc *pbc); + /*! \brief Synchronize the device stream. + */ + void synchronizeStream(); + private: //! CUDA stream diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 99f2a4f0de..2cebfd7920 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -352,7 +352,7 @@ void gmx::LegacySimulator::do_md() GMX_LOG(mdlog.info).asParagraph(). appendText("Updating coordinates on the GPU."); } - integrator = std::make_unique(*ir, *top_global, nullptr); + integrator = std::make_unique(*ir, *top_global, fr->stateGpu->getUpdateStream()); } if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate) @@ -1241,7 +1241,9 @@ void gmx::LegacySimulator::do_md() doPressureCouple, ir->nstpcouple*ir->delta_t, M); stateGpu->copyCoordinatesFromGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::All); stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::All); - stateGpu->synchronizeStream(); + // Synchronize the update stream. + // TODO: Replace with event-based synchronization. + integrator->synchronizeStream(); } else { diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index f7f24405b5..92057c8320 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -1501,26 +1501,32 @@ int Mdrunner::mdrunner() fcd->orires.nr != 0, fcd->disres.nsystems != 0); - const void *commandStream = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_stream(fr->pmedata) : nullptr; - const void *deviceContext = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_context(fr->pmedata) : nullptr; - const int paddingSize = pme_gpu_get_padding_size(fr->pmedata); - - const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible( + const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible( false, inputrec, doRerun, vsite.get(), ms, replExParams, fcd, static_cast(filenames.size()), filenames.data(), &observablesHistory, membed); - const bool useModularSimulator = inputIsCompatibleWithModularSimulator && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr); - GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync; + const bool useModularSimulator = inputIsCompatibleWithModularSimulator && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr); - // We initialize GPU state even for the CPU runs so we will have a more verbose - // error if someone will try accessing it from the CPU codepath - gmx::StatePropagatorDataGpu stateGpu(commandStream, - deviceContext, - transferKind, - paddingSize); - fr->stateGpu = &stateGpu; + std::unique_ptr stateGpu; + if (gpusWereDetected && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME)) || useGpuForUpdate)) + { + const void *pmeStream = pme_gpu_get_device_stream(fr->pmedata); + const void *localStream = fr->nbv->gpu_nbv != nullptr ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::Local) : nullptr; + const void *nonLocalStream = fr->nbv->gpu_nbv != nullptr ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, Nbnxm::InteractionLocality::NonLocal) : nullptr; + const void *deviceContext = pme_gpu_get_device_context(fr->pmedata); + const int paddingSize = pme_gpu_get_padding_size(fr->pmedata); + GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync; + + stateGpu = std::make_unique(pmeStream, + localStream, + nonLocalStream, + deviceContext, + transferKind, + paddingSize); + fr->stateGpu = stateGpu.get(); + } // TODO This is not the right place to manage the lifetime of // this data structure, but currently it's the easiest way to diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index 212fde53dd..ff43c4a807 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -96,12 +96,16 @@ class StatePropagatorDataGpu * \todo A DeviceContext object is visible in CPU parts of the code so we * can stop passing a void*. * - * \param[in] commandStream GPU stream, nullptr allowed. - * \param[in] deviceContext GPU context, nullptr allowed. - * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). - * \param[in] paddingSize Padding size for coordinates buffer. - */ - StatePropagatorDataGpu(const void *commandStream, + * \param[in] pmeStream Device PME stream, nullptr allowed. + * \param[in] localStream Device NBNXM local stream, nullptr allowed. + * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed. + * \param[in] deviceContext Device context, nullptr allowed. + * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). + * \param[in] paddingSize Padding size for coordinates buffer. + */ + StatePropagatorDataGpu(const void *pmeStream, + const void *localStream, + const void *nonLocalStream, const void *deviceContext, GpuApiCallBehavior transferKind, int paddingSize); @@ -202,9 +206,14 @@ class StatePropagatorDataGpu */ void copyForcesFromGpu(gmx::ArrayRef h_f, AtomLocality atomLocality); - /*! \brief Synchronize the underlying GPU stream + + /*! \brief Getter for the update stream. + * + * \todo This is temporary here, until the management of this stream is taken over. + * + * \returns The device command stream to use in update-constraints. */ - void synchronizeStream(); + void* getUpdateStream(); /*! \brief Getter for the number of local atoms. * diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index d66cfc552f..6a7dd64730 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -54,10 +54,12 @@ class StatePropagatorDataGpu::Impl { }; -StatePropagatorDataGpu::StatePropagatorDataGpu(const void * /* commandStream */, - const void * /* deviceContext */, - GpuApiCallBehavior /* transferKind */, - int /* paddingSize */) +StatePropagatorDataGpu::StatePropagatorDataGpu(const void * /* pmeStream */, + const void * /* localStream */, + const void * /* nonLocalStream */, + const void * /* deviceContext */, + GpuApiCallBehavior /* transferKind */, + int /* paddingSize */) : impl_(nullptr) { } @@ -136,9 +138,10 @@ void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef /* h_f GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); } -void StatePropagatorDataGpu::synchronizeStream() +void* StatePropagatorDataGpu::getUpdateStream() { GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + return nullptr; } int StatePropagatorDataGpu::numAtomsLocal() diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index f32d6df9de..16aead662a 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -80,12 +80,16 @@ class StatePropagatorDataGpu::Impl * \todo A DeviceContext object is visible in CPU parts of the code so we * can stop passing a void*. * - * \param[in] commandStream GPU stream, nullptr allowed. - * \param[in] deviceContext GPU context, nullptr allowed. - * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). - * \param[in] paddingSize Padding size for coordinates buffer. + * \param[in] pmeStream Device PME stream, nullptr allowed. + * \param[in] localStream Device NBNXM local stream, nullptr allowed. + * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed. + * \param[in] deviceContext Device context, nullptr allowed. + * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). + * \param[in] paddingSize Padding size for coordinates buffer. */ - Impl(const void *commandStream, + Impl(const void *pmeStream, + const void *localStream, + const void *nonLocalStream, const void *deviceContext, GpuApiCallBehavior transferKind, int paddingSize); @@ -98,6 +102,9 @@ class StatePropagatorDataGpu::Impl * The coordinates buffer is reallocated with the padding added at the end. The * size of padding is set by the constructor. * + * \note The PME requires clearing of the padding, which is done in the pmeStream_. + * Hence the pmeStream_ should be created in the gpuContext_. + * * \param[in] numAtomsLocal Number of atoms in local domain. * \param[in] numAtomsAll Total number of atoms to handle. */ @@ -184,9 +191,13 @@ class StatePropagatorDataGpu::Impl void copyForcesFromGpu(gmx::ArrayRef h_f, AtomLocality atomLocality); - /*! \brief Synchronize the underlying GPU stream + /*! \brief Getter for the update stream. + * + * \todo This is temporary here, until the management of this stream is taken over. + * + * \returns The device command stream to use in update-constraints. */ - void synchronizeStream(); + void* getUpdateStream(); /*! \brief Getter for the number of local atoms. * @@ -202,10 +213,14 @@ class StatePropagatorDataGpu::Impl private: - /*! \brief GPU stream. - * \todo The stream should be set to non-nullptr once the synchronization points are restored - */ - CommandStream commandStream_ = nullptr; + //! GPU PME stream. + CommandStream pmeStream_ = nullptr; + //! GPU NBNXM local stream. + CommandStream localStream_ = nullptr; + //! GPU NBNXM non-local stream + CommandStream nonLocalStream_ = nullptr; + //! GPU Update-constreaints stream. + CommandStream updateStream_ = nullptr; /*! \brief GPU context (for OpenCL builds) * \todo Make a Context class usable in CPU code */ @@ -245,27 +260,31 @@ class StatePropagatorDataGpu::Impl * * \todo Template on locality. * - * \param[in,out] d_data Device-side buffer. - * \param[in,out] h_data Host-side buffer. - * \param[in] dataSize Device-side data allocation size. - * \param[in] atomLocality If all, local or non-local ranges should be copied. + * \param[out] d_data Device-side buffer. + * \param[in] h_data Host-side buffer. + * \param[in] dataSize Device-side data allocation size. + * \param[in] atomLocality If all, local or non-local ranges should be copied. + * \param[in] commandStream GPU stream to execute copy in. */ - void copyToDevice(DeviceBuffer d_data, - gmx::ArrayRef h_data, - int dataSize, - AtomLocality atomLocality); + void copyToDevice(DeviceBuffer d_data, + const gmx::ArrayRef h_data, + int dataSize, + AtomLocality atomLocality, + CommandStream commandStream); /*! \brief Performs the copy of data from device to host buffer. * - * \param[in,out] h_data Host-side buffer. - * \param[in,out] d_data Device-side buffer. - * \param[in] dataSize Device-side data allocation size. - * \param[in] atomLocality If all, local or non-local ranges should be copied. + * \param[out] h_data Host-side buffer. + * \param[in] d_data Device-side buffer. + * \param[in] dataSize Device-side data allocation size. + * \param[in] atomLocality If all, local or non-local ranges should be copied. + * \param[in] commandStream GPU stream to execute copy in. */ void copyFromDevice(gmx::ArrayRef h_data, DeviceBuffer d_data, int dataSize, - AtomLocality atomLocality); + AtomLocality atomLocality, + CommandStream commandStream); }; } // namespace gmx diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 89446f2477..3cc0bb7653 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -62,8 +62,10 @@ namespace gmx { -StatePropagatorDataGpu::Impl::Impl(gmx_unused const void *commandStream, - gmx_unused const void *deviceContext, +StatePropagatorDataGpu::Impl::Impl(const void *pmeStream, + const void *localStream, + const void *nonLocalStream, + const void *deviceContext, GpuApiCallBehavior transferKind, int paddingSize) : transferKind_(transferKind), @@ -72,18 +74,29 @@ StatePropagatorDataGpu::Impl::Impl(gmx_unused const void *commandStream, GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build."); - // Set the stream-context pair for the OpenCL builds, - // use the nullptr stream for CUDA builds -#if GMX_GPU == GMX_GPU_OPENCL - if (commandStream != nullptr) + if (pmeStream != nullptr) + { + pmeStream_ = *static_cast(pmeStream); + } + if (localStream != nullptr) + { + localStream_ = *static_cast(localStream); + } + if (nonLocalStream != nullptr) { - commandStream_ = *static_cast(commandStream); + nonLocalStream_ = *static_cast(nonLocalStream); } +// The OpenCL build will never use the updateStream +// TODO: The update stream should be created only when it is needed. +#if GMX_GPU == GMX_GPU_CUDA + cudaError_t stat; + stat = cudaStreamCreate(&updateStream_); + CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu"); +#endif if (deviceContext != nullptr) { deviceContext_ = *static_cast(deviceContext); } -#endif } @@ -114,7 +127,10 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_; if (paddingAllocationSize > 0) { - clearDeviceBufferAsync(&d_x_, DIM*numAtomsAll_, DIM*paddingAllocationSize, commandStream_); + // The PME stream is used here because: + // 1. The padding clearing is only needed by PME. + // 2. It is the stream that is created in the PME OpenCL context. + clearDeviceBufferAsync(&d_x_, DIM*numAtomsAll_, DIM*paddingAllocationSize, pmeStream_); } reallocateDeviceBuffer(&d_v_, DIM*numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_); @@ -151,11 +167,16 @@ std::tuple StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer d_data, const gmx::ArrayRef h_data, int dataSize, - AtomLocality atomLocality) + AtomLocality atomLocality, + CommandStream commandStream) { #if GMX_GPU == GMX_GPU_OPENCL GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds."); + // The PME stream is used for OpenCL builds, because it is the context that it associated with the + // PME task which requires the coordinates managed here in OpenCL. + // TODO: This will have to be changed when the OpenCL implementation will be extended. + commandStream = pmeStream_; #endif GMX_UNUSED_VALUE(dataSize); @@ -173,21 +194,25 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range."); GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range."); - // TODO: Use the proper stream copyToDeviceBuffer(&d_data, reinterpret_cast(&h_data.data()[atomsStartAt]), elementsStartAt, numElementsToCopy, - commandStream_, transferKind_, nullptr); + commandStream, transferKind_, nullptr); } } void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_data, DeviceBuffer d_data, int dataSize, - AtomLocality atomLocality) + AtomLocality atomLocality, + CommandStream commandStream) { #if GMX_GPU == GMX_GPU_OPENCL GMX_ASSERT(deviceContext_ != nullptr, "GPU context should be set in OpenCL builds."); + // The PME stream is used for OpenCL builds, because it is the context that it associated with the + // PME task which requires the coordinates managed here in OpenCL. + // TODO: This will have to be changed when the OpenCL implementation will be extended. + commandStream = pmeStream_; #endif GMX_UNUSED_VALUE(dataSize); @@ -205,11 +230,9 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_da GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range."); GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range."); - // TODO: Use the proper stream copyFromDeviceBuffer(reinterpret_cast(&h_data.data()[atomsStartAt]), &d_data, elementsStartAt, numElementsToCopy, - commandStream_, transferKind_, nullptr); - + commandStream, transferKind_, nullptr); } } @@ -221,13 +244,15 @@ DeviceBuffer StatePropagatorDataGpu::Impl::getCoordinates() void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef h_x, AtomLocality atomLocality) { - copyToDevice(d_x_, h_x, d_xSize_, atomLocality); + // TODO: Use the correct stream + copyToDevice(d_x_, h_x, d_xSize_, atomLocality, nullptr); } void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef h_x, AtomLocality atomLocality) { - copyFromDevice(h_x, d_x_, d_xSize_, atomLocality); + // TODO: Use the correct stream + copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, nullptr); } @@ -239,13 +264,15 @@ DeviceBuffer StatePropagatorDataGpu::Impl::getVelocities() void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef h_v, AtomLocality atomLocality) { - copyToDevice(d_v_, h_v, d_vSize_, atomLocality); + // TODO: Use the correct stream + copyToDevice(d_v_, h_v, d_vSize_, atomLocality, nullptr); } void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef h_v, AtomLocality atomLocality) { - copyFromDevice(h_v, d_v_, d_vSize_, atomLocality); + // TODO: Use the correct stream + copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, nullptr); } @@ -257,18 +284,20 @@ DeviceBuffer StatePropagatorDataGpu::Impl::getForces() void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef h_f, AtomLocality atomLocality) { - copyToDevice(d_f_, h_f, d_fSize_, atomLocality); + // TODO: Use the correct stream + copyToDevice(d_f_, h_f, d_fSize_, atomLocality, nullptr); } void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef h_f, AtomLocality atomLocality) { - copyFromDevice(h_f, d_f_, d_fSize_, atomLocality); + // TODO: Use the correct stream + copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, nullptr); } -void StatePropagatorDataGpu::Impl::synchronizeStream() +void* StatePropagatorDataGpu::Impl::getUpdateStream() { - gpuStreamSynchronize(commandStream_); + return updateStream_; } int StatePropagatorDataGpu::Impl::numAtomsLocal() @@ -283,11 +312,15 @@ int StatePropagatorDataGpu::Impl::numAtomsAll() -StatePropagatorDataGpu::StatePropagatorDataGpu(const void *commandStream, +StatePropagatorDataGpu::StatePropagatorDataGpu(const void *pmeStream, + const void *localStream, + const void *nonLocalStream, const void *deviceContext, GpuApiCallBehavior transferKind, int paddingSize) - : impl_(new Impl(commandStream, + : impl_(new Impl(pmeStream, + localStream, + nonLocalStream, deviceContext, transferKind, paddingSize)) @@ -365,9 +398,9 @@ void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef h_f, return impl_->copyForcesFromGpu(h_f, atomLocality); } -void StatePropagatorDataGpu::synchronizeStream() +void* StatePropagatorDataGpu::getUpdateStream() { - return impl_->synchronizeStream(); + return impl_->getUpdateStream(); } int StatePropagatorDataGpu::numAtomsLocal() -- 2.22.0