From 8eadec224efff46cb8d0538884ce7d3c1d11c788 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Tue, 18 Feb 2020 01:36:38 +0100 Subject: [PATCH] Make DeviceStream into a class Refs #3314 Refs #3311 Change-Id: Ic270864f0e82af63f91a91c9951bf678795680fa --- src/gromacs/domdec/domdec.cpp | 4 +- src/gromacs/domdec/domdec.h | 5 +- src/gromacs/domdec/gpuhaloexchange.h | 5 +- src/gromacs/domdec/gpuhaloexchange_impl.cpp | 4 +- src/gromacs/domdec/gpuhaloexchange_impl.cu | 30 ++-- src/gromacs/domdec/gpuhaloexchange_impl.cuh | 8 +- src/gromacs/ewald/pme.h | 3 +- .../ewald/pme_coordinate_receiver_gpu.h | 3 +- .../pme_coordinate_receiver_gpu_impl.cpp | 2 +- .../ewald/pme_coordinate_receiver_gpu_impl.cu | 8 +- .../ewald/pme_coordinate_receiver_gpu_impl.h | 4 +- src/gromacs/ewald/pme_force_sender_gpu.h | 4 +- .../ewald/pme_force_sender_gpu_impl.cpp | 2 +- .../ewald/pme_force_sender_gpu_impl.cu | 8 +- src/gromacs/ewald/pme_force_sender_gpu_impl.h | 4 +- src/gromacs/ewald/pme_gpu.cpp | 2 +- src/gromacs/ewald/pme_gpu_3dfft.cu | 2 +- src/gromacs/ewald/pme_gpu_3dfft.h | 4 +- src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp | 10 +- src/gromacs/ewald/pme_gpu_internal.cpp | 95 ++++++------- src/gromacs/ewald/pme_gpu_internal.h | 10 +- src/gromacs/ewald/pme_gpu_timings.cpp | 4 +- src/gromacs/ewald/pme_gpu_types_host_impl.h | 5 +- src/gromacs/ewald/pme_only.cpp | 8 +- src/gromacs/ewald/pme_pp_comm_gpu_impl.cu | 10 +- src/gromacs/ewald/pme_pp_comm_gpu_impl.h | 2 +- src/gromacs/gpu_utils/CMakeLists.txt | 6 + src/gromacs/gpu_utils/cudautils.cuh | 18 +-- src/gromacs/gpu_utils/device_stream.cpp | 58 ++++++++ src/gromacs/gpu_utils/device_stream.cu | 122 ++++++++++++++++ src/gromacs/gpu_utils/device_stream.h | 134 ++++++++++++++++++ src/gromacs/gpu_utils/device_stream_ocl.cpp | 97 +++++++++++++ src/gromacs/gpu_utils/devicebuffer.cuh | 22 +-- src/gromacs/gpu_utils/devicebuffer_ocl.h | 35 ++--- .../gpu_utils/gpueventsynchronizer.cuh | 10 +- .../gpu_utils/gpueventsynchronizer_ocl.h | 10 +- src/gromacs/gpu_utils/gpuregiontimer.cuh | 12 +- src/gromacs/gpu_utils/gpuregiontimer.h | 14 +- src/gromacs/gpu_utils/gpuregiontimer_ocl.h | 6 +- src/gromacs/gpu_utils/gputraits.cuh | 12 +- src/gromacs/gpu_utils/gputraits.h | 2 - src/gromacs/gpu_utils/gputraits_ocl.h | 10 +- src/gromacs/gpu_utils/oclutils.h | 18 +-- .../gpu_utils/tests/typecasts_runner.cu | 7 +- src/gromacs/listed_forces/gpubonded.h | 4 +- src/gromacs/listed_forces/gpubonded_impl.cpp | 2 +- src/gromacs/listed_forces/gpubonded_impl.cu | 24 ++-- src/gromacs/listed_forces/gpubonded_impl.h | 7 +- src/gromacs/listed_forces/gpubondedkernels.cu | 2 +- src/gromacs/mdlib/leapfrog_gpu.cu | 12 +- src/gromacs/mdlib/leapfrog_gpu.cuh | 6 +- src/gromacs/mdlib/lincs_gpu.cu | 24 ++-- src/gromacs/mdlib/lincs_gpu.cuh | 9 +- src/gromacs/mdlib/settle_gpu.cu | 12 +- src/gromacs/mdlib/settle_gpu.cuh | 6 +- src/gromacs/mdlib/tests/constrtestrunners.cu | 13 +- .../mdlib/tests/leapfrogtestrunners.cu | 15 +- src/gromacs/mdlib/tests/settletestrunners.cu | 14 +- src/gromacs/mdlib/update_constrain_gpu.h | 8 +- .../mdlib/update_constrain_gpu_impl.cpp | 2 +- .../mdlib/update_constrain_gpu_impl.cu | 23 ++- src/gromacs/mdlib/update_constrain_gpu_impl.h | 8 +- src/gromacs/mdrun/md.cpp | 17 ++- src/gromacs/mdrun/runner.cpp | 10 +- .../mdtypes/state_propagator_data_gpu.h | 14 +- .../state_propagator_data_gpu_impl.cpp | 10 +- .../mdtypes/state_propagator_data_gpu_impl.h | 40 +++--- .../state_propagator_data_gpu_impl_gpu.cpp | 108 +++++++------- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 102 ++++++------- 
.../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 99 +++++++------ src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h | 2 +- src/gromacs/nbnxm/gpu_common.h | 4 +- src/gromacs/nbnxm/gpu_data_mgmt.h | 3 +- src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 85 +++++------ .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp | 66 ++++----- src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h | 2 +- 76 files changed, 1000 insertions(+), 592 deletions(-) create mode 100644 src/gromacs/gpu_utils/device_stream.cpp create mode 100644 src/gromacs/gpu_utils/device_stream.cu create mode 100644 src/gromacs/gpu_utils/device_stream.h create mode 100644 src/gromacs/gpu_utils/device_stream_ocl.cpp diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index e020a1405b..ebcc92bf2e 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -3203,8 +3203,8 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRefcomm; const gmx_domdec_comm_dim_t& cd = comm.cd[0]; const gmx_domdec_ind_t& ind = cd.ind[pulse_]; @@ -167,7 +166,7 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch"); std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin()); - copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, stream, + copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, GpuApiCallBehavior::Async, nullptr); // This rank will push data to its neighbor, so needs to know @@ -215,7 +214,7 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box config.gridSize[1] = 1; config.gridSize[2] = 1; config.sharedMemorySize = 0; - config.stream = nonLocalStream_; + config.stream = nonLocalStream_.stream(); const float3* sendBuf = d_sendBuf_; const float3* d_x = d_x_; @@ -264,7 +263,7 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) if (!accumulateForces) { // Clear local portion of force array (in local stream) - cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_); + cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_.stream()); } // ensure non-local stream waits for local stream, due to dependence on @@ -286,7 +285,7 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) config.gridSize[1] = 1; config.gridSize[2] = 1; config.sharedMemorySize = 0; - config.stream = nonLocalStream_; + config.stream = nonLocalStream_.stream(); const float3* recvBuf = d_recvBuf_; const int* indexMap = d_indexMap_; @@ -373,8 +372,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr, int recvRank) { - cudaError_t stat; - cudaStream_t stream = nonLocalStream_; + cudaError_t stat; // We asynchronously push data to remote rank. The remote // destination pointer has already been set in the init fn. We @@ -386,7 +384,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr, if (sendSize > 0) { stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize * DIM * sizeof(float), - cudaMemcpyDeviceToDevice, stream); + cudaMemcpyDeviceToDevice, nonLocalStream_.stream()); CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed"); } @@ -397,13 +395,13 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr, // to its stream. 
GpuEventSynchronizer* haloDataTransferRemote; - haloDataTransferLaunched_->markEvent(stream); + haloDataTransferLaunched_->markEvent(nonLocalStream_); MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0, &haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE); - haloDataTransferRemote->enqueueWaitEvent(stream); + haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_); #else GMX_UNUSED_VALUE(sendRank); GMX_UNUSED_VALUE(recvRank); @@ -419,8 +417,8 @@ GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent() GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, - void* localStream, - void* nonLocalStream, + const DeviceStream& localStream, + const DeviceStream& nonLocalStream, int pulse) : dd_(dd), sendRankX_(dd->neighbor[0][1]), @@ -431,8 +429,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, haloDataTransferLaunched_(new GpuEventSynchronizer()), mpi_comm_mysim_(mpi_comm_mysim), deviceContext_(deviceContext), - localStream_(*static_cast(localStream)), - nonLocalStream_(*static_cast(nonLocalStream)), + localStream_(localStream), + nonLocalStream_(nonLocalStream), pulse_(pulse) { @@ -466,8 +464,8 @@ GpuHaloExchange::Impl::~Impl() GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, - void* localStream, - void* nonLocalStream, + const DeviceStream& localStream, + const DeviceStream& nonLocalStream, int pulse) : impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse)) { diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh index ba22bc5262..9a033ed54f 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh @@ -79,8 +79,8 @@ public: Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, - void* localStream, - void* nonLocalStream, + const DeviceStream& localStream, + const DeviceStream& nonLocalStream, int pulse); ~Impl(); @@ -185,9 +185,9 @@ private: //! GPU context object const DeviceContext& deviceContext_; //! CUDA stream for local non-bonded calculations - cudaStream_t localStream_ = nullptr; + const DeviceStream& localStream_; //! CUDA stream for non-local non-bonded calculations - cudaStream_t nonLocalStream_ = nullptr; + const DeviceStream& nonLocalStream_; //! full coordinates buffer in GPU memory float3* d_x_ = nullptr; //! full forces buffer in GPU memory diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 1c3cb9b774..8aa2c079b1 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -72,6 +72,7 @@ struct gmx_wallcycle; struct NumPmeDomains; class DeviceContext; +class DeviceStream; enum class GpuTaskCompletion; class PmeGpuProgram; class GpuEventSynchronizer; @@ -433,7 +434,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT * \param[in] pme The PME data structure. * \returns Pointer to GPU stream object. */ -GPU_FUNC_QUALIFIER void* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) +GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); /*! 
\brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h index b5d02a719e..144bd27fdd 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h @@ -47,6 +47,7 @@ #include "gromacs/utility/classhelpers.h" #include "gromacs/utility/gmxmpi.h" +class DeviceStream; struct PpRanks; namespace gmx @@ -64,7 +65,7 @@ public: * \param[in] comm Communicator used for simulation * \param[in] ppRanks List of PP ranks */ - PmeCoordinateReceiverGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); + PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); ~PmeCoordinateReceiverGpu(); /*! \brief diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp index 0cb848e6c0..b0da71cc47 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp @@ -62,7 +62,7 @@ class PmeCoordinateReceiverGpu::Impl }; /*!\brief Constructor stub. */ -PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* /* pmeStream */, +PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& /* pmeStream */, MPI_Comm /* comm */, gmx::ArrayRef /* ppRanks */) : impl_(nullptr) diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu index b2e7fa009d..db81fb7b0a 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu @@ -55,8 +55,10 @@ namespace gmx { -PmeCoordinateReceiverGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks) : - pmeStream_(*static_cast(pmeStream)), +PmeCoordinateReceiverGpu::Impl::Impl(const DeviceStream& pmeStream, + MPI_Comm comm, + gmx::ArrayRef ppRanks) : + pmeStream_(pmeStream), comm_(comm), ppRanks_(ppRanks) { @@ -122,7 +124,7 @@ void PmeCoordinateReceiverGpu::Impl::enqueueWaitReceiveCoordinatesFromPpCudaDire } } -PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* pmeStream, +PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks) : impl_(new Impl(pmeStream, comm, ppRanks)) diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h index 4f3bbe2e4e..e1186a2f3a 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h @@ -62,7 +62,7 @@ public: * \param[in] comm Communicator used for simulation * \param[in] ppRanks List of PP ranks */ - Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); + Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); ~Impl(); /*! \brief @@ -84,7 +84,7 @@ public: private: //! CUDA stream for PME operations - cudaStream_t pmeStream_ = nullptr; + const DeviceStream& pmeStream_; //! communicator for simulation MPI_Comm comm_; //! list of PP ranks diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h index c774994824..df8e1873f8 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu.h +++ b/src/gromacs/ewald/pme_force_sender_gpu.h @@ -46,6 +46,8 @@ #include "gromacs/utility/classhelpers.h" #include "gromacs/utility/gmxmpi.h" +class DeviceStream; + /*! 
\libinternal * \brief Contains information about the PP ranks that partner this PME rank. */ struct PpRanks @@ -73,7 +75,7 @@ public: * \param[in] comm Communicator used for simulation * \param[in] ppRanks List of PP ranks */ - PmeForceSenderGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); + PmeForceSenderGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); ~PmeForceSenderGpu(); /*! \brief diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp index b4a531fed9..3ae502ccd8 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp @@ -61,7 +61,7 @@ class PmeForceSenderGpu::Impl }; /*!\brief Constructor stub. */ -PmeForceSenderGpu::PmeForceSenderGpu(const void* /*pmeStream */, +PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */, MPI_Comm /* comm */, gmx::ArrayRef /* ppRanks */) : impl_(nullptr) diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu index 0ad8fbf123..6e6d21eaf2 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu @@ -55,8 +55,8 @@ namespace gmx { /*! \brief Create PME-PP GPU communication object */ -PmeForceSenderGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks) : - pmeStream_(*static_cast(pmeStream)), +PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks) : + pmeStream_(pmeStream), comm_(comm), ppRanks_(ppRanks) { @@ -106,7 +106,9 @@ void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank) #endif } -PmeForceSenderGpu::PmeForceSenderGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks) : +PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& pmeStream, + MPI_Comm comm, + gmx::ArrayRef ppRanks) : impl_(new Impl(pmeStream, comm, ppRanks)) { } diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h index 5c1271b2dd..91fe1c1140 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h @@ -61,7 +61,7 @@ public: * \param[in] comm Communicator used for simulation * \param[in] ppRanks List of PP ranks */ - Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); + Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef ppRanks); ~Impl(); /*! \brief @@ -78,7 +78,7 @@ public: private: //! CUDA stream for PME operations - cudaStream_t pmeStream_ = nullptr; + const DeviceStream& pmeStream_; //! Event triggered when to allow remote PP stream to syn with pme stream GpuEventSynchronizer pmeSync_; //! 
communicator for simulation diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 4c4ed4851d..cbcab23b1a 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -433,7 +433,7 @@ void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer d_x) pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x); } -void* pme_gpu_get_device_stream(const gmx_pme_t* pme) +const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* pme) { if (!pme || !pme_gpu_active(pme)) { diff --git a/src/gromacs/ewald/pme_gpu_3dfft.cu b/src/gromacs/ewald/pme_gpu_3dfft.cu index 2b30dcdef1..9f9578962e 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft.cu +++ b/src/gromacs/ewald/pme_gpu_3dfft.cu @@ -104,7 +104,7 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) realGridSizePaddedTotal, CUFFT_C2R, batch); handleCufftError(result, "cufftPlanMany C2R plan failure"); - cudaStream_t stream = pmeGpu->archSpecific->pmeStream; + cudaStream_t stream = pmeGpu->archSpecific->pmeStream_.stream(); GMX_RELEASE_ASSERT(stream, "Using the default CUDA stream for PME cuFFT"); result = cufftSetStream(planR2C_, stream); diff --git a/src/gromacs/ewald/pme_gpu_3dfft.h b/src/gromacs/ewald/pme_gpu_3dfft.h index 07d3b1af57..fc6d67a935 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft.h +++ b/src/gromacs/ewald/pme_gpu_3dfft.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -93,7 +93,7 @@ private: #elif GMX_GPU == GMX_GPU_OPENCL clfftPlanHandle planR2C_; clfftPlanHandle planC2R_; - std::vector commandStreams_; + std::vector deviceStreams_; cl_mem realGrid_; cl_mem complexGrid_; #endif diff --git a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp index c6e1b6448c..b341a27b82 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp @@ -81,7 +81,7 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) "Complex padding not implemented"); } cl_context context = pmeGpu->archSpecific->deviceContext_.context(); - commandStreams_.push_back(pmeGpu->archSpecific->pmeStream); + deviceStreams_.push_back(pmeGpu->archSpecific->pmeStream_.stream()); realGrid_ = kernelParamsPtr->grid.d_realGrid; complexGrid_ = kernelParamsPtr->grid.d_fourierGrid; const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT; @@ -124,9 +124,9 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) handleClfftError(clfftSetPlanOutStride(planC2R_, dims, realGridStrides.data()), "clFFT stride setting failure"); - handleClfftError(clfftBakePlan(planR2C_, commandStreams_.size(), commandStreams_.data(), nullptr, nullptr), + handleClfftError(clfftBakePlan(planR2C_, deviceStreams_.size(), deviceStreams_.data(), nullptr, nullptr), "clFFT precompiling failure"); - handleClfftError(clfftBakePlan(planC2R_, commandStreams_.size(), commandStreams_.data(), nullptr, nullptr), + handleClfftError(clfftBakePlan(planC2R_, deviceStreams_.size(), deviceStreams_.data(), nullptr, nullptr), "clFFT precompiling failure"); // TODO: implement solve kernel as R2C FFT callback @@ -166,8 +166,8 @@ void GpuParallel3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingE GMX_THROW( gmx::NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs")); } - handleClfftError(clfftEnqueueTransform(plan, direction, commandStreams_.size(), - commandStreams_.data(), waitEvents.size(), waitEvents.data(), + handleClfftError(clfftEnqueueTransform(plan, direction, deviceStreams_.size(), + deviceStreams_.data(), waitEvents.size(), waitEvents.data(), timingEvent, inputGrids, outputGrids, tempBuffer), "clFFT execution failure"); } diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index dd62e8c4cd..822109de4c 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -135,7 +135,7 @@ int pme_gpu_get_atoms_per_warp(const PmeGpu* pmeGpu) void pme_gpu_synchronize(const PmeGpu* pmeGpu) { - gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->pmeStream_.synchronize(); } void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu) @@ -156,7 +156,7 @@ void pme_gpu_free_energy_virial(PmeGpu* pmeGpu) void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu) { clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0, - c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream); + c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_); } void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu) @@ -188,7 +188,7 @@ void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu) } /* TODO: pin original buffer instead! 
*/ copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli, - 0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream, + 0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -219,7 +219,7 @@ void pme_gpu_copy_input_forces(PmeGpu* pmeGpu) GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU"); float* h_forcesFloat = reinterpret_cast(pmeGpu->staging.h_forces.data()); copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat, 0, - DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream, + DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -228,7 +228,7 @@ void pme_gpu_copy_output_forces(PmeGpu* pmeGpu) GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU"); float* h_forcesFloat = reinterpret_cast(pmeGpu->staging.h_forces.data()); copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0, - DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream, + DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -243,7 +243,7 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_ pmeGpu->archSpecific->deviceContext_); copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, const_cast(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); if (c_usePadding) { const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms; @@ -251,7 +251,7 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_ if (paddingCount > 0) { clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex, - paddingCount, pmeGpu->archSpecific->pmeStream); + paddingCount, pmeGpu->archSpecific->pmeStream_); } } } @@ -360,7 +360,7 @@ void pme_gpu_free_grids(const PmeGpu* pmeGpu) void pme_gpu_clear_grids(const PmeGpu* pmeGpu) { clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0, - pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_); } void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu) @@ -393,10 +393,10 @@ void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu) allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize, pmeGpu->archSpecific->deviceContext_); copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0, - newFractShiftsSize, pmeGpu->archSpecific->pmeStream, + newFractShiftsSize, pmeGpu->archSpecific->pmeStream_, GpuApiCallBehavior::Async, nullptr); copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(), 0, - newFractShiftsSize, pmeGpu->archSpecific->pmeStream, + newFractShiftsSize, pmeGpu->archSpecific->pmeStream_, GpuApiCallBehavior::Async, nullptr); #endif } @@ -417,21 +417,21 @@ void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu) bool pme_gpu_stream_query(const PmeGpu* pmeGpu) { - return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream); + return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream_); } void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid) { 
copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid, 0, pmeGpu->archSpecific->realGridSize, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid) { copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid, 0, - pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream, + pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); - pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream_); } void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu) @@ -441,11 +441,11 @@ void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu) const size_t splinesCount = DIM * nAtomsPadded * pmeGpu->common->pme_order; auto* kernelParamsPtr = pmeGpu->kernelParams.get(); copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta, 0, splinesCount, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices, - 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream, + 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -459,20 +459,20 @@ void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu) { // TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0, - pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream); + pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream_); clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0, pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM, - pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->pmeStream_); clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0, pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM, - pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->pmeStream_); } copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices, - 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream, + 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -529,35 +529,25 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu) int highest_priority, lowest_priority; stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, 
&highest_priority); CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed"); - stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream, + cudaStream_t stream; + stat = cudaStreamCreateWithPriority(&stream, cudaStreamDefault, // cudaStreamNonBlocking, highest_priority); + pmeGpu->archSpecific->pmeStream_.setStream(stream); CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed"); #elif GMX_GPU == GMX_GPU_OPENCL cl_command_queue_properties queueProperties = pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0; cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId; cl_int clError; - pmeGpu->archSpecific->pmeStream = clCreateCommandQueue( - pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError); - if (clError != CL_SUCCESS) - { - GMX_THROW(gmx::InternalError("Failed to create PME command queue")); - } -#endif -} + pmeGpu->archSpecific->pmeStream_.setStream(clCreateCommandQueue( + pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError)); + -void pme_gpu_destroy_specific(const PmeGpu* pmeGpu) -{ -#if GMX_GPU == GMX_GPU_CUDA - /* Destroy the CUDA stream */ - cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream); - CU_RET_ERR(stat, "PME cudaStreamDestroy error"); -#elif GMX_GPU == GMX_GPU_OPENCL - cl_int clError = clReleaseCommandQueue(pmeGpu->archSpecific->pmeStream); if (clError != CL_SUCCESS) { - gmx_warning("Failed to destroy PME command queue"); + GMX_THROW(gmx::InternalError( + gmx::formatString("Failed to create PME command queue (OpenCL error %d)", clError).c_str())); } #endif } @@ -979,9 +969,6 @@ void pme_gpu_destroy(PmeGpu* pmeGpu) pme_gpu_destroy_3dfft(pmeGpu); - /* Free the GPU-framework specific data last */ - pme_gpu_destroy_specific(pmeGpu); - delete pmeGpu; } @@ -1205,7 +1192,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, "Need a valid coordinate synchronizer on PP+PME ranks with CUDA."); if (xReadyOnDevice) { - xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream); + xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_); } const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; @@ -1217,7 +1204,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, config.blockSize[2] = atomsPerBlock; config.gridSize[0] = dimGrid.first; config.gridSize[1] = dimGrid.second; - config.stream = pmeGpu->archSpecific->pmeStream; + config.stream = pmeGpu->archSpecific->pmeStream_.stream(); int timingId; PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; @@ -1285,7 +1272,7 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd if (copyInputAndOutputGrid) { copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, h_gridFloat, 0, - pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream, + pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } @@ -1335,7 +1322,7 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1) / gridLinesPerBlock; config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim]; - config.stream = pmeGpu->archSpecific->pmeStream; + config.stream = pmeGpu->archSpecific->pmeStream_.stream(); int timingId = gtPME_SOLVE; PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; @@ -1366,13 +1353,13 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd { 
copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy, &kernelParamsPtr->constants.d_virialAndEnergy, 0, c_virialAndEnergyCount, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } if (copyInputAndOutputGrid) { copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid, 0, - pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream, + pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } } @@ -1457,7 +1444,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid) config.blockSize[2] = atomsPerBlock; config.gridSize[0] = dimGrid.first; config.gridSize[1] = dimGrid.second; - config.stream = pmeGpu->archSpecific->pmeStream; + config.stream = pmeGpu->archSpecific->pmeStream_.stream(); // TODO test different cache configs @@ -1483,7 +1470,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid) if (pmeGpu->settings.useGpuForceReduction) { - pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream_); } else { @@ -1515,11 +1502,11 @@ void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBufferkernelParams->atoms.d_coordinates = d_x; } -void* pme_gpu_get_stream(const PmeGpu* pmeGpu) +const DeviceStream* pme_gpu_get_stream(const PmeGpu* pmeGpu) { if (pmeGpu) { - return static_cast(&pmeGpu->archSpecific->pmeStream); + return &pmeGpu->archSpecific->pmeStream_; } else { diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index 67a1bc3d1c..b515e3b222 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -313,14 +313,6 @@ void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu); */ void pme_gpu_init_internal(PmeGpu* pmeGpu); -/*! \libinternal \brief - * Destroys the PME GPU-framework specific data. - * Should be called last in the PME GPU destructor. - * - * \param[in] pmeGpu The PME GPU structure. - */ -void pme_gpu_destroy_specific(const PmeGpu* pmeGpu); - /*! \libinternal \brief * Initializes the CUDA FFT structures. * @@ -405,7 +397,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_A * \param[in] pmeGpu The PME GPU structure. * \returns Pointer to stream object. */ -GPU_FUNC_QUALIFIER void* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) +GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); /*! 
\brief Return pointer to the sync object triggered after the PME force calculation completion diff --git a/src/gromacs/ewald/pme_gpu_timings.cpp b/src/gromacs/ewald/pme_gpu_timings.cpp index 3680631c57..3a1f457468 100644 --- a/src/gromacs/ewald/pme_gpu_timings.cpp +++ b/src/gromacs/ewald/pme_gpu_timings.cpp @@ -61,7 +61,7 @@ void pme_gpu_start_timing(const PmeGpu* pmeGpu, size_t PMEStageId) { GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index"); - pmeGpu->archSpecific->timingEvents[PMEStageId].openTimingRegion(pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->timingEvents[PMEStageId].openTimingRegion(pmeGpu->archSpecific->pmeStream_); } } @@ -71,7 +71,7 @@ void pme_gpu_stop_timing(const PmeGpu* pmeGpu, size_t PMEStageId) { GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index"); - pmeGpu->archSpecific->timingEvents[PMEStageId].closeTimingRegion(pmeGpu->archSpecific->pmeStream); + pmeGpu->archSpecific->timingEvents[PMEStageId].closeTimingRegion(pmeGpu->archSpecific->pmeStream_); } } diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h index 44ca3fd3c3..a019a7c031 100644 --- a/src/gromacs/ewald/pme_gpu_types_host_impl.h +++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h @@ -74,8 +74,6 @@ struct PmeGpuSpecific * \param[in] deviceContext GPU device context. */ PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {} - /*! \brief The GPU stream where everything related to the PME happens. */ - CommandStream pmeStream; /*! \brief * A handle to the GPU context. @@ -85,6 +83,9 @@ struct PmeGpuSpecific */ const DeviceContext& deviceContext_; + /*! \brief The GPU stream where everything related to the PME happens. */ + DeviceStream pmeStream_; + /* Synchronization events */ /*! \brief Triggered after the PME Force Calculations have been completed */ GpuEventSynchronizer pmeForcesReady; diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 845b1a33ec..fe51deb5fc 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -629,16 +629,16 @@ int gmx_pmeonly(struct gmx_pme_t* pme, const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed); if (useGpuForPme) { - const void* commandStream = pme_gpu_get_device_stream(pme); + const DeviceStream& deviceStream = *pme_gpu_get_device_stream(pme); changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy()); changePinningPolicy(&pme_pp->x, pme_get_pinning_policy()); if (c_enableGpuPmePpComms) { pme_pp->pmeCoordinateReceiverGpu = std::make_unique( - commandStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks); + deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks); pme_pp->pmeForceSenderGpu = std::make_unique( - commandStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks); + deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks); } GMX_RELEASE_ASSERT( deviceContext != nullptr, @@ -646,7 +646,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme, // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here. // This should be made safer. 
stateGpu = std::make_unique( - commandStream, *deviceContext, GpuApiCallBehavior::Async, + &deviceStream, *deviceContext, GpuApiCallBehavior::Async, pme_gpu_get_padding_size(pme), wcycle); } diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index 91962ff947..f5aac2981a 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -64,7 +64,9 @@ PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, const DeviceContext& device GMX_RELEASE_ASSERT( GMX_THREAD_MPI, "PME-PP GPU Communication is currently only supported with thread-MPI enabled"); - cudaStreamCreate(&pmePpCommStream_); + cudaStream_t stream; + cudaStreamCreate(&stream); + pmePpCommStream_.setStream(stream); } PmePpCommGpu::Impl::~Impl() = default; @@ -98,7 +100,7 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi // Pull force data from remote GPU void* pmeForcePtr = receivePmeForceToGpu ? static_cast(d_pmeForces_) : recvPtr; cudaError_t stat = cudaMemcpyAsync(pmeForcePtr, remotePmeFBuffer_, recvSize * DIM * sizeof(float), - cudaMemcpyDefault, pmePpCommStream_); + cudaMemcpyDefault, pmePpCommStream_.stream()); CU_RET_ERR(stat, "cudaMemcpyAsync on Recv from PME CUDA direct data transfer failed"); if (receivePmeForceToGpu) @@ -112,7 +114,7 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi { // Ensure CPU waits for PME forces to be copied before reducing // them with other forces on the CPU - cudaStreamSynchronize(pmePpCommStream_); + cudaStreamSynchronize(pmePpCommStream_.stream()); } #else GMX_UNUSED_VALUE(recvPtr); @@ -131,7 +133,7 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr, coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_); cudaError_t stat = cudaMemcpyAsync(remotePmeXBuffer_, sendPtr, sendSize * DIM * sizeof(float), - cudaMemcpyDefault, pmePpCommStream_); + cudaMemcpyDefault, pmePpCommStream_.stream()); CU_RET_ERR(stat, "cudaMemcpyAsync on Send to PME CUDA direct data transfer failed"); // Record and send event to allow PME task to sync to above transfer before commencing force calculations diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h index c791ea5b40..934b7c40c6 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h @@ -119,7 +119,7 @@ private: //! Device context object const DeviceContext& deviceContext_; //! CUDA stream used for the communication operations in this class - cudaStream_t pmePpCommStream_ = nullptr; + DeviceStream pmePpCommStream_; //! Remote location of PME coordinate data buffer void* remotePmeXBuffer_ = nullptr; //! 
Remote location of PME force data buffer
diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt
index 8672e450ca..ce70d0b049 100644
--- a/src/gromacs/gpu_utils/CMakeLists.txt
+++ b/src/gromacs/gpu_utils/CMakeLists.txt
@@ -45,6 +45,7 @@ gmx_add_libgromacs_sources(
 if(GMX_USE_OPENCL)
     gmx_add_libgromacs_sources(
         device_context_ocl.cpp
+        device_stream_ocl.cpp
         gpu_utils_ocl.cpp
         ocl_compiler.cpp
         ocl_caching.cpp
@@ -53,10 +54,15 @@ if(GMX_USE_OPENCL)
 elseif(GMX_USE_CUDA)
     gmx_add_libgromacs_sources(
         cudautils.cu
+        device_stream.cu
         gpu_utils.cu
         pinning.cu
         pmalloc_cuda.cu
         )
+else()
+    gmx_add_libgromacs_sources(
+        device_stream.cpp
+        )
 endif()
 
 if (BUILD_TESTING)
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index 71d9b7dac4..48212bf3bc 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -216,25 +216,15 @@ static inline void rvec_inc(rvec a, const float3 b)
     rvec_inc(a, tmp);
 }
 
-/*! \brief Wait for all taks in stream \p s to complete.
- *
- * \param[in] s stream to synchronize with
- */
-static inline void gpuStreamSynchronize(cudaStream_t s)
-{
-    cudaError_t stat = cudaStreamSynchronize(s);
-    CU_RET_ERR(stat, "cudaStreamSynchronize failed");
-}
-
 /*! \brief Returns true if all tasks in \p s have completed.
  *
- * \param[in] s stream to check
+ * \param[in] deviceStream CUDA stream to check.
  *
- * \returns True if all tasks enqueued in the stream \p s (at the time of this call) have completed.
+ * \returns True if all tasks enqueued in the stream \p deviceStream (at the time of this call) have completed.
  */
-static inline bool haveStreamTasksCompleted(cudaStream_t s)
+static inline bool haveStreamTasksCompleted(const DeviceStream& deviceStream)
 {
-    cudaError_t stat = cudaStreamQuery(s);
+    cudaError_t stat = cudaStreamQuery(deviceStream.stream());
 
     if (stat == cudaErrorNotReady)
     {
diff --git a/src/gromacs/gpu_utils/device_stream.cpp b/src/gromacs/gpu_utils/device_stream.cpp
new file mode 100644
index 0000000000..1b5b016fba
--- /dev/null
+++ b/src/gromacs/gpu_utils/device_stream.cpp
@@ -0,0 +1,58 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream stub for CPU-only builds.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "device_stream.h"
+
+DeviceStream::DeviceStream() = default;
+
+void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
+                        const DeviceContext& /* deviceContext */,
+                        DeviceStreamPriority /* priority */,
+                        const bool /* useTiming */)
+{
+}
+
+DeviceStream::~DeviceStream() = default;
+
+void DeviceStream::synchronize() const {}
\ No newline at end of file
diff --git a/src/gromacs/gpu_utils/device_stream.cu b/src/gromacs/gpu_utils/device_stream.cu
new file mode 100644
index 0000000000..8d0b484846
--- /dev/null
+++ b/src/gromacs/gpu_utils/device_stream.cu
@@ -0,0 +1,122 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream for CUDA.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "device_stream.h"
+
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+DeviceStream::DeviceStream()
+{
+    stream_ = nullptr;
+}
+
+void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
+                        const DeviceContext& /* deviceContext */,
+                        DeviceStreamPriority priority,
+                        const bool /* useTiming */)
+{
+    cudaError_t stat;
+
+    if (priority == DeviceStreamPriority::Normal)
+    {
+        stat = cudaStreamCreate(&stream_);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError(gmx::formatString(
+                    "Could not create CUDA stream (CUDA error %d: %s).", stat, cudaGetErrorString(stat))));
+        }
+    }
+    else if (priority == DeviceStreamPriority::High)
+    {
+        // Note that the device we're running on does not have to
+        // support priorities, because we are querying the priority
+        // range, which in that case will be a single value.
+        int highestPriority;
+        stat = cudaDeviceGetStreamPriorityRange(nullptr, &highestPriority);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError(gmx::formatString(
+                    "Could not query CUDA stream priority range (CUDA error %d: %s).", stat,
+                    cudaGetErrorString(stat))));
+        }
+
+        stat = cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, highestPriority);
+        if (stat != cudaSuccess)
+        {
+            GMX_THROW(gmx::InternalError(gmx::formatString(
+                    "Could not create CUDA stream with high priority (CUDA error %d: %s).", stat,
+                    cudaGetErrorString(stat))));
+        }
+    }
+}
+
+DeviceStream::~DeviceStream()
+{
+    if (stream_)
+    {
+        cudaError_t stat = cudaStreamDestroy(stream_);
+        GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                           gmx::formatString("Failed to release CUDA stream (CUDA error %d: %s).",
+                                             stat, cudaGetErrorString(stat))
+                                   .c_str());
+        stream_ = nullptr;
+    }
+}
+
+cudaStream_t DeviceStream::stream() const
+{
+    return stream_;
+}
+
+void DeviceStream::synchronize() const
+{
+    cudaError_t stat = cudaStreamSynchronize(stream_);
+    GMX_RELEASE_ASSERT(stat == cudaSuccess,
+                       gmx::formatString("cudaStreamSynchronize failed (CUDA error %d: %s).", stat,
+                                         cudaGetErrorString(stat))
+                               .c_str());
+}
\ No newline at end of file
diff --git a/src/gromacs/gpu_utils/device_stream.h b/src/gromacs/gpu_utils/device_stream.h
new file mode 100644
index 0000000000..2e654e529b
--- /dev/null
+++ b/src/gromacs/gpu_utils/device_stream.h
@@ -0,0 +1,134 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_STREAM_H
+#define GMX_GPU_UTILS_DEVICE_STREAM_H
+
+/*! \libinternal \file
+ *
+ * \brief Declarations for DeviceStream class.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ * \inlibraryapi
+ */
+
+#include "config.h"
+
+#if GMX_GPU == GMX_GPU_OPENCL
+#    include "gromacs/gpu_utils/gmxopencl.h"
+#endif
+#include "gromacs/utility/classhelpers.h"
+
+struct DeviceInformation;
+class DeviceContext;
+
+//! Enumeration describing the priority with which a stream operates.
+enum class DeviceStreamPriority : int
+{
+    //! High-priority stream
+    High,
+    //! Normal-priority stream
+    Normal,
+    //! Conventional termination of the enumeration
+    Count
+};
+
+//! Platform-agnostic GPU stream wrapper (CUDA stream or OpenCL command queue).
+class DeviceStream
+{
+public:
+    //! Default constructor
+    DeviceStream();
+    //! Destructor
+    ~DeviceStream();
+
+    /*! \brief Initialize
+     *
+     * \param[in] deviceInfo     Platform-specific device information (only used in OpenCL).
+     * \param[in] deviceContext  Device context (only used in OpenCL).
+     * \param[in] priority       Stream priority: high or normal (only used in CUDA).
+     * \param[in] useTiming      Whether timing should be enabled (only used in OpenCL).
+     */
+    void init(const DeviceInformation& deviceInfo,
+              const DeviceContext&     deviceContext,
+              DeviceStreamPriority     priority,
+              const bool               useTiming);
+
+    /*! \brief Construct and init.
+     *
+     * \param[in] deviceInfo     Platform-specific device information (only used in OpenCL).
+     * \param[in] deviceContext  Device context (only used in OpenCL).
+     * \param[in] priority       Stream priority: high or normal (only used in CUDA).
+     * \param[in] useTiming      Whether timing should be enabled (only used in OpenCL).
+     */
+    DeviceStream(const DeviceInformation& deviceInfo,
+                 const DeviceContext&     deviceContext,
+                 DeviceStreamPriority     priority,
+                 const bool               useTiming)
+    {
+        init(deviceInfo, deviceContext, priority, useTiming);
+    }
+
+    //! Synchronize the stream
+    void synchronize() const;
+
+#if GMX_GPU == GMX_GPU_CUDA
+
+    //! Getter
+    cudaStream_t stream() const;
+    //! Setter (temporary, will be removed in the follow-up)
+    void setStream(cudaStream_t stream) { stream_ = stream; }
+
+private:
+    cudaStream_t stream_ = nullptr;
+
+#elif GMX_GPU == GMX_GPU_OPENCL
+
+    //! Getter
+    cl_command_queue stream() const;
+    //! Setter (temporary, will be removed in the follow-up)
+    void setStream(cl_command_queue stream) { stream_ = stream; }
+
+private:
+    cl_command_queue stream_ = nullptr;
+
+#endif
+
+    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceStream);
+};
+
+#endif // GMX_GPU_UTILS_DEVICE_STREAM_H
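Usage sketch for reviewers (illustrative only, not part of the patch): deviceInfo and deviceContext stand for the DeviceInformation and DeviceContext objects a caller already holds, and launchKernelWork() is a hypothetical helper that enqueues asynchronous GPU work.

    // Create a high-priority stream; on CUDA this maps to
    // cudaStreamCreateWithPriority(), on OpenCL to clCreateCommandQueue()
    // (the priority hint is ignored there).
    DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::High,
                              /* useTiming = */ false);

    launchKernelWork(deviceStream); // hypothetical: enqueues work on deviceStream

    // Block the host until everything submitted to this stream has completed
    // (cudaStreamSynchronize() or clFinish() under the hood).
    deviceStream.synchronize();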
diff --git a/src/gromacs/gpu_utils/device_stream_ocl.cpp b/src/gromacs/gpu_utils/device_stream_ocl.cpp
new file mode 100644
index 0000000000..013480aacf
--- /dev/null
+++ b/src/gromacs/gpu_utils/device_stream_ocl.cpp
@@ -0,0 +1,97 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream for OpenCL.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/device_context_ocl.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+DeviceStream::DeviceStream()
+{
+    stream_ = nullptr;
+}
+
+void DeviceStream::init(const DeviceInformation& deviceInfo,
+                        const DeviceContext&     deviceContext,
+                        DeviceStreamPriority /* priority */,
+                        const bool useTiming)
+{
+    cl_command_queue_properties queueProperties = useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
+    cl_device_id                deviceId        = deviceInfo.oclDeviceId;
+    cl_int                      clError;
+    stream_ = clCreateCommandQueue(deviceContext.context(), deviceId, queueProperties, &clError);
+    if (clError != CL_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError(gmx::formatString(
+                "Failed to create OpenCL command queue on GPU %s (OpenCL error ID %d).",
+                deviceInfo.device_name, clError)));
+    }
+}
+
+DeviceStream::~DeviceStream()
+{
+    if (stream_)
+    {
+        cl_int clError = clReleaseCommandQueue(stream_);
+        GMX_RELEASE_ASSERT(
+                clError == CL_SUCCESS,
+                gmx::formatString("Failed to release OpenCL stream (OpenCL error ID %d).", clError).c_str());
+        stream_ = nullptr;
+    }
+}
+
+cl_command_queue DeviceStream::stream() const
+{
+    return stream_;
+}
+
+void DeviceStream::synchronize() const
+{
+    cl_int clError = clFinish(stream_);
+    GMX_RELEASE_ASSERT(
+            CL_SUCCESS == clError,
+            gmx::formatString("Error caught during clFinish (OpenCL error ID %d).", clError).c_str());
+}
\ No newline at end of file
diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh
index 59255bfa93..d4bfe8c35b 100644
--- a/src/gromacs/gpu_utils/devicebuffer.cuh
+++ b/src/gromacs/gpu_utils/devicebuffer.cuh
@@ -96,7 +96,7 @@ void freeDeviceBuffer(DeviceBuffer* buffer)
  * \param[in] hostBuffer     Pointer to the raw host-side memory, also typed \p ValueType
  * \param[in] startingOffset Offset (in values) at the device-side buffer to copy into.
  * \param[in] numValues      Number of values to copy.
- * \param[in] stream         GPU stream to perform asynchronous copy in.
+ * \param[in] deviceStream   GPU stream to perform asynchronous copy in.
  * \param[in] transferKind   Copy type: synchronous or asynchronous.
  * \param[out] timingEvent   A dummy pointer to the H2D copy timing event to be filled in.
  *                           Not used in CUDA implementation.
@@ -106,7 +106,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
                         const ValueType*    hostBuffer,
                         size_t              startingOffset,
                         size_t              numValues,
-                        CommandStream       stream,
+                        const DeviceStream& deviceStream,
                         GpuApiCallBehavior  transferKind,
                         CommandEvent* /*timingEvent*/)
 {
@@ -125,7 +125,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
             GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Source host buffer was not pinned for CUDA");
 
             stat = cudaMemcpyAsync(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes,
-                                   cudaMemcpyHostToDevice, stream);
+                                   cudaMemcpyHostToDevice, deviceStream.stream());
             GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous H2D copy failed");
 
             break;
@@ -150,7 +150,7 @@
  * \param[in] buffer         Pointer to the device-side buffer
  * \param[in] startingOffset Offset (in values) at the device-side buffer to copy from.
  * \param[in] numValues      Number of values to copy.
- * \param[in] stream         GPU stream to perform asynchronous copy in.
+ * \param[in] deviceStream   GPU stream to perform asynchronous copy in.
  * \param[in] transferKind   Copy type: synchronous or asynchronous.
  * \param[out] timingEvent   A dummy pointer to the H2D copy timing event to be filled in.
  *                           Not used in CUDA implementation.
@@ -160,7 +160,7 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, DeviceBuffer* buffer, size_t startingOffset, size_t numValues, - CommandStream stream, + const DeviceStream& deviceStream, GpuApiCallBehavior transferKind, CommandEvent* /*timingEvent*/) { @@ -175,7 +175,7 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, GMX_ASSERT(isHostMemoryPinned(hostBuffer), "Destination host buffer was not pinned for CUDA"); stat = cudaMemcpyAsync(hostBuffer, *((ValueType**)buffer) + startingOffset, bytes, - cudaMemcpyDeviceToHost, stream); + cudaMemcpyDeviceToHost, deviceStream.stream()); GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous D2H copy failed"); break; @@ -196,16 +196,20 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, * \param[in,out] buffer Pointer to the device-side buffer * \param[in] startingOffset Offset (in values) at the device-side buffer to start clearing at. * \param[in] numValues Number of values to clear. - * \param[in] stream GPU stream. + * \param[in] deviceStream GPU stream. */ template -void clearDeviceBufferAsync(DeviceBuffer* buffer, size_t startingOffset, size_t numValues, CommandStream stream) +void clearDeviceBufferAsync(DeviceBuffer* buffer, + size_t startingOffset, + size_t numValues, + const DeviceStream& deviceStream) { GMX_ASSERT(buffer, "needs a buffer pointer"); const size_t bytes = numValues * sizeof(ValueType); const char pattern = 0; - cudaError_t stat = cudaMemsetAsync(*((ValueType**)buffer) + startingOffset, pattern, bytes, stream); + cudaError_t stat = cudaMemsetAsync(*((ValueType**)buffer) + startingOffset, pattern, bytes, + deviceStream.stream()); GMX_RELEASE_ASSERT(stat == cudaSuccess, "Couldn't clear the device buffer"); } diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h index ee1adc1cce..05be260b89 100644 --- a/src/gromacs/gpu_utils/devicebuffer_ocl.h +++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h @@ -108,7 +108,7 @@ void freeDeviceBuffer(DeviceBuffer* buffer) * \param[in] hostBuffer Pointer to the raw host-side memory, also typed \p ValueType * \param[in] startingOffset Offset (in values) at the device-side buffer to copy into. * \param[in] numValues Number of values to copy. - * \param[in] stream GPU stream to perform asynchronous copy in. + * \param[in] deviceStream GPU stream to perform asynchronous copy in. * \param[in] transferKind Copy type: synchronous or asynchronous. * \param[out] timingEvent A pointer to the H2D copy timing event to be filled in. 
* If the pointer is not null, the event can further be used @@ -119,7 +119,7 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, const ValueType* hostBuffer, size_t startingOffset, size_t numValues, - CommandStream stream, + const DeviceStream& deviceStream, GpuApiCallBehavior transferKind, CommandEvent* timingEvent) { @@ -135,8 +135,8 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, switch (transferKind) { case GpuApiCallBehavior::Async: - clError = clEnqueueWriteBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0, - nullptr, timingEvent); + clError = clEnqueueWriteBuffer(deviceStream.stream(), *buffer, CL_FALSE, offset, bytes, + hostBuffer, 0, nullptr, timingEvent); GMX_RELEASE_ASSERT( clError == CL_SUCCESS, gmx::formatString("Asynchronous H2D copy failed (OpenCL error %d: %s)", clError, @@ -145,8 +145,8 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, break; case GpuApiCallBehavior::Sync: - clError = clEnqueueWriteBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0, - nullptr, timingEvent); + clError = clEnqueueWriteBuffer(deviceStream.stream(), *buffer, CL_TRUE, offset, bytes, + hostBuffer, 0, nullptr, timingEvent); GMX_RELEASE_ASSERT( clError == CL_SUCCESS, gmx::formatString("Synchronous H2D copy failed (OpenCL error %d: %s)", clError, @@ -168,7 +168,7 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, * \param[in] buffer Pointer to the device-side buffer * \param[in] startingOffset Offset (in values) at the device-side buffer to copy from. * \param[in] numValues Number of values to copy. - * \param[in] stream GPU stream to perform asynchronous copy in. + * \param[in] deviceStream GPU stream to perform asynchronous copy in. * \param[in] transferKind Copy type: synchronous or asynchronous. * \param[out] timingEvent A pointer to the H2D copy timing event to be filled in. * If the pointer is not null, the event can further be used @@ -179,7 +179,7 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, DeviceBuffer* buffer, size_t startingOffset, size_t numValues, - CommandStream stream, + const DeviceStream& deviceStream, GpuApiCallBehavior transferKind, CommandEvent* timingEvent) { @@ -191,8 +191,8 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, switch (transferKind) { case GpuApiCallBehavior::Async: - clError = clEnqueueReadBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0, - nullptr, timingEvent); + clError = clEnqueueReadBuffer(deviceStream.stream(), *buffer, CL_FALSE, offset, bytes, + hostBuffer, 0, nullptr, timingEvent); GMX_RELEASE_ASSERT( clError == CL_SUCCESS, gmx::formatString("Asynchronous D2H copy failed (OpenCL error %d: %s)", clError, @@ -201,8 +201,8 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, break; case GpuApiCallBehavior::Sync: - clError = clEnqueueReadBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0, - nullptr, timingEvent); + clError = clEnqueueReadBuffer(deviceStream.stream(), *buffer, CL_TRUE, offset, bytes, + hostBuffer, 0, nullptr, timingEvent); GMX_RELEASE_ASSERT( clError == CL_SUCCESS, gmx::formatString("Synchronous D2H copy failed (OpenCL error %d: %s)", clError, @@ -221,10 +221,13 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, * \param[in,out] buffer Pointer to the device-side buffer * \param[in] startingOffset Offset (in values) at the device-side buffer to start clearing at. * \param[in] numValues Number of values to clear. - * \param[in] stream GPU stream. + * \param[in] deviceStream GPU stream. 
*/ template -void clearDeviceBufferAsync(DeviceBuffer* buffer, size_t startingOffset, size_t numValues, CommandStream stream) +void clearDeviceBufferAsync(DeviceBuffer* buffer, + size_t startingOffset, + size_t numValues, + const DeviceStream& deviceStream) { GMX_ASSERT(buffer, "needs a buffer pointer"); const size_t offset = startingOffset * sizeof(ValueType); @@ -233,8 +236,8 @@ void clearDeviceBufferAsync(DeviceBuffer* buffer, size_t startingOffs const cl_uint numWaitEvents = 0; const cl_event* waitEvents = nullptr; cl_event commandEvent; - cl_int clError = clEnqueueFillBuffer(stream, *buffer, &pattern, sizeof(pattern), offset, bytes, - numWaitEvents, waitEvents, &commandEvent); + cl_int clError = clEnqueueFillBuffer(deviceStream.stream(), *buffer, &pattern, sizeof(pattern), + offset, bytes, numWaitEvents, waitEvents, &commandEvent); GMX_RELEASE_ASSERT(clError == CL_SUCCESS, gmx::formatString("Couldn't clear the device buffer (OpenCL error %d: %s)", clError, ocl_get_error_string(clError).c_str()) diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh index e2e6ac8704..69487de586 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh +++ b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -80,9 +80,9 @@ public: /*! \brief Marks the synchronization point in the \p stream. * Should be followed by waitForEvent(). */ - inline void markEvent(CommandStream stream) + inline void markEvent(const DeviceStream& deviceStream) { - cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, stream); + cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, deviceStream.stream()); GMX_ASSERT(stat == cudaSuccess, "cudaEventRecord failed"); } /*! \brief Synchronizes the host thread on the marked event. */ @@ -92,9 +92,9 @@ public: GMX_ASSERT(stat == cudaSuccess, "cudaEventSynchronize failed"); } /*! \brief Enqueues a wait for the recorded event in stream \p stream */ - inline void enqueueWaitEvent(CommandStream stream) + inline void enqueueWaitEvent(const DeviceStream& deviceStream) { - cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(stream, event_, 0); + cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0); GMX_ASSERT(stat == cudaSuccess, "cudaStreamWaitEvent failed"); } diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h index b9298f385f..9a62b5b6d4 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h +++ b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -85,10 +85,10 @@ public: /*! \brief Marks the synchronization point in the \p stream. 
* Should be called first and then followed by waitForEvent(). */ - inline void markEvent(CommandStream stream) + inline void markEvent(const DeviceStream& deviceStream) { GMX_ASSERT(nullptr == event_, "Do not call markEvent more than once!"); - cl_int clError = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &event_); + cl_int clError = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &event_); if (CL_SUCCESS != clError) { GMX_THROW(gmx::InternalError("Failed to enqueue the GPU synchronization event: " @@ -112,9 +112,9 @@ public: * After enqueue, the associated event is released, so this method should * be only called once per markEvent() call. */ - inline void enqueueWaitEvent(CommandStream stream) + inline void enqueueWaitEvent(const DeviceStream& deviceStream) { - cl_int clError = clEnqueueBarrierWithWaitList(stream, 1, &event_, nullptr); + cl_int clError = clEnqueueBarrierWithWaitList(deviceStream.stream(), 1, &event_, nullptr); if (CL_SUCCESS != clError) { GMX_THROW(gmx::InternalError("Failed to enqueue device barrier for the GPU event: " diff --git a/src/gromacs/gpu_utils/gpuregiontimer.cuh b/src/gromacs/gpu_utils/gpuregiontimer.cuh index 52e36c038a..c56d60da61 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer.cuh +++ b/src/gromacs/gpu_utils/gpuregiontimer.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -79,15 +79,17 @@ public: GpuRegionTimerImpl(GpuRegionTimerImpl&&) = delete; /*! \brief Will be called before the region start. */ - inline void openTimingRegion(CommandStream s) + inline void openTimingRegion(const DeviceStream& deviceStream) { - CU_RET_ERR(cudaEventRecord(eventStart_, s), "GPU timing recording failure"); + CU_RET_ERR(cudaEventRecord(eventStart_, deviceStream.stream()), + "GPU timing recording failure"); } /*! \brief Will be called after the region end. */ - inline void closeTimingRegion(CommandStream s) + inline void closeTimingRegion(const DeviceStream& deviceStream) { - CU_RET_ERR(cudaEventRecord(eventStop_, s), "GPU timing recording failure"); + CU_RET_ERR(cudaEventRecord(eventStop_, deviceStream.stream()), + "GPU timing recording failure"); } /*! \brief Returns the last measured region timespan (in milliseconds) and calls reset() */ diff --git a/src/gromacs/gpu_utils/gpuregiontimer.h b/src/gromacs/gpu_utils/gpuregiontimer.h index 6798159ac9..f0860b164d 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer.h +++ b/src/gromacs/gpu_utils/gpuregiontimer.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -84,9 +84,9 @@ public: /*! \brief * To be called before the region start. * - * \param[in] s The GPU command stream where the event being measured takes place. + * \param[in] deviceStream The GPU command stream where the event being measured takes place. 
*/ - void openTimingRegion(CommandStream s) + void openTimingRegion(const DeviceStream& deviceStream) { if (c_debugTimerState) { @@ -96,14 +96,14 @@ public: GMX_ASSERT(debugState_ == TimerState::Idle, error.c_str()); debugState_ = TimerState::Recording; } - impl_.openTimingRegion(s); + impl_.openTimingRegion(deviceStream); } /*! \brief * To be called after the region end. * - * \param[in] s The GPU command stream where the event being measured takes place. + * \param[in] deviceStream The GPU command stream where the event being measured takes place. */ - void closeTimingRegion(CommandStream s) + void closeTimingRegion(const DeviceStream& deviceStream) { if (c_debugTimerState) { @@ -114,7 +114,7 @@ public: debugState_ = TimerState::Stopped; } callCount_++; - impl_.closeTimingRegion(s); + impl_.closeTimingRegion(deviceStream); } /*! \brief * Accumulates the last timespan of all the events used into the total duration, diff --git a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h index 3c1d9b2b84..788e41de5d 100644 --- a/src/gromacs/gpu_utils/gpuregiontimer_ocl.h +++ b/src/gromacs/gpu_utils/gpuregiontimer_ocl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -82,9 +82,9 @@ public: GpuRegionTimerImpl(GpuRegionTimerImpl&&) = delete; /*! \brief Should be called before the region start. */ - inline void openTimingRegion(CommandStream /*unused*/) {} + inline void openTimingRegion(const DeviceStream& /*unused*/) {} /*! \brief Should be called after the region end. */ - inline void closeTimingRegion(CommandStream /*unused*/) {} + inline void closeTimingRegion(const DeviceStream& /*unused*/) {} /*! \brief Returns the last measured region timespan (in milliseconds) and calls reset(). */ inline double getLastRangeTime() { diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh index b477cdcb4c..ec3424a8f4 100644 --- a/src/gromacs/gpu_utils/gputraits.cuh +++ b/src/gromacs/gpu_utils/gputraits.cuh @@ -45,6 +45,8 @@ * \ingroup module_gpu_utils */ +#include "gromacs/gpu_utils/device_stream.h" + /*! \brief CUDA device information. * * The CUDA device information is queried and set at detection and contains @@ -61,8 +63,6 @@ struct DeviceInformation int stat; }; -//! \brief GPU command stream -using CommandStream = cudaStream_t; //! \brief Single GPU call timing event - meaningless in CUDA using CommandEvent = void; @@ -73,10 +73,10 @@ using CommandEvent = void; */ struct KernelLaunchConfig { - size_t gridSize[3] = { 1, 1, 1 }; //!< Block counts - size_t blockSize[3] = { 1, 1, 1 }; //!< Per-block thread counts - size_t sharedMemorySize = 0; //!< Shared memory size in bytes - CommandStream stream = nullptr; //!< Stream to launch kernel in + size_t gridSize[3] = { 1, 1, 1 }; //!< Block counts + size_t blockSize[3] = { 1, 1, 1 }; //!< Per-block thread counts + size_t sharedMemorySize = 0; //!< Shared memory size in bytes + cudaStream_t stream = nullptr; //!< Stream to launch kernel in }; //! Sets whether device code can use arrays that are embedded in structs. 
diff --git a/src/gromacs/gpu_utils/gputraits.h b/src/gromacs/gpu_utils/gputraits.h index a36a5cc3bc..5fec00303a 100644 --- a/src/gromacs/gpu_utils/gputraits.h +++ b/src/gromacs/gpu_utils/gputraits.h @@ -63,8 +63,6 @@ struct DeviceInformation // No member needed }; -//! \brief GPU command stream -using CommandStream = void*; //! \brief Single GPU call timing event using CommandEvent = void*; diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h index caf837552a..0438c084d1 100644 --- a/src/gromacs/gpu_utils/gputraits_ocl.h +++ b/src/gromacs/gpu_utils/gputraits_ocl.h @@ -79,8 +79,6 @@ struct DeviceInformation size_t maxWorkGroupSize; //!< Workgroup total size limit (CL_DEVICE_MAX_WORK_GROUP_SIZE). }; -//! \brief GPU command stream -using CommandStream = cl_command_queue; //! \brief Single GPU call timing event using CommandEvent = cl_event; @@ -91,10 +89,10 @@ using CommandEvent = cl_event; */ struct KernelLaunchConfig { - size_t gridSize[3] = { 1, 1, 1 }; //!< Work groups (CUDA blocks) counts - size_t blockSize[3] = { 1, 1, 1 }; //!< Per work group (CUDA block) thread counts - size_t sharedMemorySize = 0; //!< Shared memory size in bytes - CommandStream stream = nullptr; //!< Stream to launch kernel in + size_t gridSize[3] = { 1, 1, 1 }; //!< Work groups (CUDA blocks) counts + size_t blockSize[3] = { 1, 1, 1 }; //!< Per work group (CUDA block) thread counts + size_t sharedMemorySize = 0; //!< Shared memory size in bytes + cl_command_queue stream = nullptr; //!< Stream to launch kernel in }; /*! \brief Sets whether device code can use arrays that are embedded in structs. diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h index 90f5b04bfe..5e36d91ffd 100644 --- a/src/gromacs/gpu_utils/oclutils.h +++ b/src/gromacs/gpu_utils/oclutils.h @@ -45,6 +45,7 @@ #include #include "gromacs/gpu_utils/device_context.h" +#include "gromacs/gpu_utils/device_stream.h" #include "gromacs/gpu_utils/gmxopencl.h" #include "gromacs/gpu_utils/gputraits_ocl.h" #include "gromacs/utility/exceptions.h" @@ -130,17 +131,6 @@ void pfree(void* h_ptr); /*! \brief Convert error code to diagnostic string */ std::string ocl_get_error_string(cl_int error); -/*! \brief Calls clFinish() in the stream \p s. - * - * \param[in] s stream to synchronize with - */ -static inline void gpuStreamSynchronize(cl_command_queue s) -{ - cl_int cl_error = clFinish(s); - GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, - ("Error caught during clFinish:" + ocl_get_error_string(cl_error)).c_str()); -} - //! A debug checker to track cl_events being released correctly inline void ensureReferenceCount(const cl_event& event, unsigned int refCount) { @@ -156,11 +146,9 @@ inline void ensureReferenceCount(const cl_event& event, unsigned int refCount) /*! \brief Pretend to synchronize an OpenCL stream (dummy implementation). * - * \param[in] s queue to check - * - * \returns True if all tasks enqueued in the stream \p s (at the time of this call) have completed. + * \returns Not implemented in OpenCL. 
*/ -static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s) +static inline bool haveStreamTasksCompleted(const DeviceStream& /* deviceStream */) { GMX_RELEASE_ASSERT(false, "haveStreamTasksCompleted is not implemented for OpenCL"); return false; diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.cu b/src/gromacs/gpu_utils/tests/typecasts_runner.cu index e16dd8ebf5..682035bb5f 100644 --- a/src/gromacs/gpu_utils/tests/typecasts_runner.cu +++ b/src/gromacs/gpu_utils/tests/typecasts_runner.cu @@ -112,12 +112,13 @@ void convertRVecToFloat3OnDevice(std::vector& h_rVecOutput, const std { DeviceInformation deviceInfo; const DeviceContext deviceContext(deviceInfo); + const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false); const int numElements = h_rVecInput.size(); DeviceBuffer d_rVecInput; allocateDeviceBuffer(&d_rVecInput, numElements, deviceContext); - copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, nullptr, + copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, deviceStream, GpuApiCallBehavior::Sync, nullptr); DeviceBuffer d_float3Output; @@ -131,14 +132,14 @@ void convertRVecToFloat3OnDevice(std::vector& h_rVecOutput, const std kernelLaunchConfig.blockSize[1] = 1; kernelLaunchConfig.blockSize[2] = 1; kernelLaunchConfig.sharedMemorySize = 0; - kernelLaunchConfig.stream = nullptr; + kernelLaunchConfig.stream = deviceStream.stream(); auto kernelPtr = convertRVecToFloat3OnDevice_kernel; const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig, &d_float3Output, &d_rVecInput, &numElements); launchGpuKernel(kernelPtr, kernelLaunchConfig, nullptr, "convertRVecToFloat3OnDevice_kernel", kernelArgs); - copyFromDeviceBuffer(h_float3Output.data(), &d_float3Output, 0, numElements, nullptr, + copyFromDeviceBuffer(h_float3Output.data(), &d_float3Output, 0, numElements, deviceStream, GpuApiCallBehavior::Sync, nullptr); saveFloat3InRVecFormat(h_rVecOutput, h_float3Output.data(), numElements); diff --git a/src/gromacs/listed_forces/gpubonded.h b/src/gromacs/listed_forces/gpubonded.h index b1c69d4572..e2c114ce4b 100644 --- a/src/gromacs/listed_forces/gpubonded.h +++ b/src/gromacs/listed_forces/gpubonded.h @@ -56,6 +56,8 @@ #include "gromacs/utility/classhelpers.h" class DeviceContext; +class DeviceStream; + struct gmx_enerdata_t; struct gmx_ffparams_t; struct gmx_mtop_t; @@ -109,7 +111,7 @@ public: //! Construct the manager with constant data and the stream to use. GpuBonded(const gmx_ffparams_t& ffparams, const DeviceContext& deviceContext, - void* streamPtr, + const DeviceStream& deviceStream, gmx_wallcycle* wcycle); //! 
Destructor ~GpuBonded(); diff --git a/src/gromacs/listed_forces/gpubonded_impl.cpp b/src/gromacs/listed_forces/gpubonded_impl.cpp index f24103229b..ec8e43323e 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.cpp +++ b/src/gromacs/listed_forces/gpubonded_impl.cpp @@ -162,7 +162,7 @@ class GpuBonded::Impl GpuBonded::GpuBonded(const gmx_ffparams_t& /* ffparams */, const DeviceContext& /* deviceContext */, - void* /*streamPtr */, + const DeviceStream& /* deviceStream */, gmx_wallcycle* /* wcycle */) : impl_(nullptr) { diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu index 763550c5c9..0d5367f698 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.cu +++ b/src/gromacs/listed_forces/gpubonded_impl.cu @@ -66,22 +66,22 @@ namespace gmx GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams, const DeviceContext& deviceContext, - void* streamPtr, + const DeviceStream& deviceStream, gmx_wallcycle* wcycle) : - deviceContext_(deviceContext) + deviceContext_(deviceContext), + deviceStream_(deviceStream) { - stream_ = *static_cast(streamPtr); wcycle_ = wcycle; allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_); // This could be an async transfer (if the source is pinned), so // long as it uses the same stream as the kernels and we are happy // to consume additional pinned pages. - copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(), stream_, - GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(), + deviceStream_, GpuApiCallBehavior::Sync, nullptr); vTot_.resize(F_NRE); allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_); - clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_); + clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream); kernelParams_.d_forceParams = d_forceParams_; kernelParams_.d_xq = d_xq_; @@ -212,7 +212,7 @@ void GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, deviceContext_); - copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), stream_, + copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), deviceStream_, GpuApiCallBehavior::Async, nullptr); } kernelParams_.fTypesOnGpu[fTypesCounter] = fType; @@ -270,7 +270,7 @@ void GpuBonded::Impl::launchEnergyTransfer() wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED); // TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage) float* h_vTot = vTot_.data(); - copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, stream_, GpuApiCallBehavior::Async, nullptr); + copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, deviceStream_, GpuApiCallBehavior::Async, nullptr); wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED); } @@ -281,7 +281,7 @@ void GpuBonded::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd) "accumulation should not occur"); wallcycle_start(wcycle_, ewcWAIT_GPU_BONDED); - cudaError_t stat = cudaStreamSynchronize(stream_); + cudaError_t stat = cudaStreamSynchronize(deviceStream_.stream()); CU_RET_ERR(stat, "D2H transfer of bonded energies failed"); wallcycle_stop(wcycle_, ewcWAIT_GPU_BONDED); @@ -304,7 +304,7 @@ void GpuBonded::Impl::clearEnergies() { wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED); - clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_); + clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, 
deviceStream_); wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED); wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } @@ -313,9 +313,9 @@ void GpuBonded::Impl::clearEnergies() GpuBonded::GpuBonded(const gmx_ffparams_t& ffparams, const DeviceContext& deviceContext, - void* streamPtr, + const DeviceStream& deviceStream, gmx_wallcycle* wcycle) : - impl_(new Impl(ffparams, deviceContext, streamPtr, wcycle)) + impl_(new Impl(ffparams, deviceContext, deviceStream, wcycle)) { } diff --git a/src/gromacs/listed_forces/gpubonded_impl.h b/src/gromacs/listed_forces/gpubonded_impl.h index a0da918893..32cce2599a 100644 --- a/src/gromacs/listed_forces/gpubonded_impl.h +++ b/src/gromacs/listed_forces/gpubonded_impl.h @@ -126,7 +126,10 @@ class GpuBonded::Impl { public: //! Constructor - Impl(const gmx_ffparams_t& ffparams, const DeviceContext& deviceContext, void* streamPtr, gmx_wallcycle* wcycle); + Impl(const gmx_ffparams_t& ffparams, + const DeviceContext& deviceContext, + const DeviceStream& deviceStream, + gmx_wallcycle* wcycle); /*! \brief Destructor, non-default needed for freeing * device-side buffers */ ~Impl(); @@ -183,7 +186,7 @@ private: //! GPU context object const DeviceContext& deviceContext_; //! \brief Bonded GPU stream, not owned by this module - CommandStream stream_; + const DeviceStream& deviceStream_; //! Parameters and pointers, passed to the CUDA kernel BondedCudaKernelParameters kernelParams_; diff --git a/src/gromacs/listed_forces/gpubondedkernels.cu b/src/gromacs/listed_forces/gpubondedkernels.cu index 166e5bb87d..e03a3f1fa1 100644 --- a/src/gromacs/listed_forces/gpubondedkernels.cu +++ b/src/gromacs/listed_forces/gpubondedkernels.cu @@ -850,7 +850,7 @@ void GpuBonded::Impl::launchKernel(const t_forcerec* fr, const matrix box) config.gridSize[0] = (fTypeRangeEnd + TPB_BONDED) / TPB_BONDED; config.gridSize[1] = 1; config.gridSize[2] = 1; - config.stream = stream_; + config.stream = deviceStream_.stream(); auto kernelPtr = exec_kernel_gpu; kernelParams_.scaleFactor = fr->ic->epsfac * fr->fudgeQQ; diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu.cu index b77162c1af..a7e19c922d 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.cu +++ b/src/gromacs/mdlib/leapfrog_gpu.cu @@ -287,7 +287,7 @@ void LeapFrogGpu::integrate(const float3* d_x, h_lambdas_[i] = tcstat[i].lambda; } copyToDeviceBuffer(&d_lambdas_, h_lambdas_.data(), 0, numTempScaleValues_, - commandStream_, GpuApiCallBehavior::Async, nullptr); + deviceStream_, GpuApiCallBehavior::Async, nullptr); } VelocityScalingType prVelocityScalingType = VelocityScalingType::None; if (doParrinelloRahman) @@ -316,9 +316,9 @@ void LeapFrogGpu::integrate(const float3* d_x, return; } -LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream) : +LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, const DeviceStream& deviceStream) : deviceContext_(deviceContext), - commandStream_(commandStream) + deviceStream_(deviceStream) { numAtoms_ = 0; @@ -328,7 +328,7 @@ LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, CommandStream comma kernelLaunchConfig_.blockSize[1] = 1; kernelLaunchConfig_.blockSize[2] = 1; kernelLaunchConfig_.sharedMemorySize = 0; - kernelLaunchConfig_.stream = commandStream_; + kernelLaunchConfig_.stream = deviceStream_.stream(); } LeapFrogGpu::~LeapFrogGpu() @@ -345,7 +345,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, 
deviceContext_);
- copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, commandStream_,
+ copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, deviceStream_,
GpuApiCallBehavior::Sync, nullptr);

// Temperature scale group map only used if there are more than one group
@@ -353,7 +353,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u
{
reallocateDeviceBuffer(&d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_,
&numTempScaleGroupsAlloc_, deviceContext_);
- copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, commandStream_,
+ copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_,
GpuApiCallBehavior::Sync, nullptr);
}

diff --git a/src/gromacs/mdlib/leapfrog_gpu.cuh b/src/gromacs/mdlib/leapfrog_gpu.cuh index 26a6fc7399..6097a9a87e 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.cuh +++ b/src/gromacs/mdlib/leapfrog_gpu.cuh
@@ -64,9 +64,9 @@ public:
/*! \brief Constructor.
*
* \param[in] deviceContext Device context (dummy in CUDA).
- * \param[in] commandStream Device command stream to use.
+ * \param[in] deviceStream Device stream to use.
*/
- LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream);
+ LeapFrogGpu(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
~LeapFrogGpu();

/*! \brief Integrate
*
@@ -115,7 +115,7 @@ private:
//! GPU context object
const DeviceContext& deviceContext_;
//! GPU stream
- CommandStream commandStream_;
+ const DeviceStream& deviceStream_;
//! GPU kernel launch config
KernelLaunchConfig kernelLaunchConfig_;
//! Number of atoms

diff --git a/src/gromacs/mdlib/lincs_gpu.cu b/src/gromacs/mdlib/lincs_gpu.cu index 03c1bd1d15..61bc717852 100644 --- a/src/gromacs/mdlib/lincs_gpu.cu +++ b/src/gromacs/mdlib/lincs_gpu.cu
@@ -447,7 +447,7 @@ void LincsGpu::apply(const float3* d_x,
{
// Fill with zeros so the values can be reduced to it
// Only 6 values are needed because virial is symmetrical
- clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, commandStream_);
+ clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_);
}

auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
@@ -475,7 +475,7 @@ void LincsGpu::apply(const float3* d_x,
{
config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);
}
- config.stream = commandStream_;
+ config.stream = deviceStream_.stream();

kernelParams_.pbcAiuc = pbcAiuc;

@@ -488,7 +488,7 @@ void LincsGpu::apply(const float3* d_x,
{
// Copy LINCS virial data and add it to the common virial
copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled, 0, 6,
- commandStream_, GpuApiCallBehavior::Sync, nullptr);
+ deviceStream_, GpuApiCallBehavior::Sync, nullptr);

// Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
virialScaled[XX][XX] += h_virialScaled_[0];
@@ -510,9 +510,9 @@ void LincsGpu::apply(const float3* d_x,

LincsGpu::LincsGpu(int numIterations,
int expansionOrder,
const DeviceContext& deviceContext,
- CommandStream commandStream) :
+ const DeviceStream& deviceStream) :
deviceContext_(deviceContext),
- commandStream_(commandStream)
+ deviceStream_(deviceStream)
{
kernelParams_.numIterations = numIterations;
kernelParams_.expansionOrder = expansionOrder;
@@ -943,23 +943,23 @@ void LincsGpu::set(const InteractionDefinitions& idef, const t_mdatoms& md)

// Copy data to GPU.
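// Note: the uploads below are synchronous (GpuApiCallBehavior::Sync) and, after
// this change, take the DeviceStream wrapper by const reference instead of a
// raw cudaStream_t, matching the copyToDeviceBuffer signature introduced above.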
copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(), 0, - kernelParams_.numConstraintsThreads, commandStream_, - GpuApiCallBehavior::Sync, nullptr); + kernelParams_.numConstraintsThreads, deviceStream_, GpuApiCallBehavior::Sync, + nullptr); copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths, constraintsTargetLengthsHost.data(), 0, kernelParams_.numConstraintsThreads, - commandStream_, GpuApiCallBehavior::Sync, nullptr); + deviceStream_, GpuApiCallBehavior::Sync, nullptr); copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts, coupledConstraintsCountsHost.data(), 0, kernelParams_.numConstraintsThreads, - commandStream_, GpuApiCallBehavior::Sync, nullptr); + deviceStream_, GpuApiCallBehavior::Sync, nullptr); copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, coupledConstraintsIndicesHost.data(), 0, maxCoupledConstraints * kernelParams_.numConstraintsThreads, - commandStream_, GpuApiCallBehavior::Sync, nullptr); + deviceStream_, GpuApiCallBehavior::Sync, nullptr); copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(), 0, - maxCoupledConstraints * kernelParams_.numConstraintsThreads, commandStream_, + maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceStream_, GpuApiCallBehavior::Sync, nullptr); GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n"); - copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, commandStream_, + copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, deviceStream_, GpuApiCallBehavior::Sync, nullptr); } diff --git a/src/gromacs/mdlib/lincs_gpu.cuh b/src/gromacs/mdlib/lincs_gpu.cuh index 4817573b80..ef03516431 100644 --- a/src/gromacs/mdlib/lincs_gpu.cuh +++ b/src/gromacs/mdlib/lincs_gpu.cuh @@ -105,9 +105,12 @@ public: * \param[in] numIterations Number of iteration for the correction of the projection. * \param[in] expansionOrder Order of the matrix inversion algorithm. * \param[in] deviceContext Device context (dummy in CUDA). - * \param[in] commandStream Device command stream. + * \param[in] deviceStream Device command stream. */ - LincsGpu(int numIterations, int expansionOrder, const DeviceContext& deviceContext, CommandStream commandStream); + LincsGpu(int numIterations, + int expansionOrder, + const DeviceContext& deviceContext, + const DeviceStream& deviceStream); /*! \brief Destructor.*/ ~LincsGpu(); @@ -172,7 +175,7 @@ private: //! GPU context object const DeviceContext& deviceContext_; //! GPU stream - CommandStream commandStream_; + const DeviceStream& deviceStream_; //! 
Parameters and pointers, passed to the GPU kernel LincsGpuKernelParameters kernelParams_; diff --git a/src/gromacs/mdlib/settle_gpu.cu b/src/gromacs/mdlib/settle_gpu.cu index 20933baf96..76daf34c1a 100644 --- a/src/gromacs/mdlib/settle_gpu.cu +++ b/src/gromacs/mdlib/settle_gpu.cu @@ -434,7 +434,7 @@ void SettleGpu::apply(const float3* d_x, { // Fill with zeros so the values can be reduced to it // Only 6 values are needed because virial is symmetrical - clearDeviceBufferAsync(&d_virialScaled_, 0, 6, commandStream_); + clearDeviceBufferAsync(&d_virialScaled_, 0, 6, deviceStream_); } auto kernelPtr = getSettleKernelPtr(updateVelocities, computeVirial); @@ -455,7 +455,7 @@ void SettleGpu::apply(const float3* d_x, { config.sharedMemorySize = 0; } - config.stream = commandStream_; + config.stream = deviceStream_.stream(); const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &numSettles_, &d_atomIds_, &settleParameters_, &d_x, &d_xp, &invdt, &d_v, @@ -465,7 +465,7 @@ void SettleGpu::apply(const float3* d_x, if (computeVirial) { - copyFromDeviceBuffer(h_virialScaled_.data(), &d_virialScaled_, 0, 6, commandStream_, + copyFromDeviceBuffer(h_virialScaled_.data(), &d_virialScaled_, 0, 6, deviceStream_, GpuApiCallBehavior::Sync, nullptr); // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object @@ -485,9 +485,9 @@ void SettleGpu::apply(const float3* d_x, return; } -SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream) : +SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream) : deviceContext_(deviceContext), - commandStream_(commandStream) + deviceStream_(deviceStream) { static_assert(sizeof(real) == sizeof(float), "Real numbers should be in single precision in GPU code."); @@ -622,7 +622,7 @@ void SettleGpu::set(const InteractionDefinitions& idef, const t_mdatoms gmx_unus settler.z = iatoms[i * nral1 + 3]; // Second hydrogen index h_atomIds_.at(i) = settler; } - copyToDeviceBuffer(&d_atomIds_, h_atomIds_.data(), 0, numSettles_, commandStream_, + copyToDeviceBuffer(&d_atomIds_, h_atomIds_.data(), 0, numSettles_, deviceStream_, GpuApiCallBehavior::Sync, nullptr); } diff --git a/src/gromacs/mdlib/settle_gpu.cuh b/src/gromacs/mdlib/settle_gpu.cuh index da8bafd8df..24584f7a4b 100644 --- a/src/gromacs/mdlib/settle_gpu.cuh +++ b/src/gromacs/mdlib/settle_gpu.cuh @@ -202,9 +202,9 @@ public: * target O-H and H-H distances. These values are also checked for * consistency. * \param[in] deviceContext Device context (dummy in CUDA). - * \param[in] commandStream Device stream to use. + * \param[in] deviceStream Device stream to use. */ - SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream); + SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream); ~SettleGpu(); @@ -255,7 +255,7 @@ private: //! GPU context object const DeviceContext& deviceContext_; //! GPU stream - CommandStream commandStream_; + const DeviceStream& deviceStream_; //! 
Scaled virial tensor (9 reals, GPU) std::vector h_virialScaled_; diff --git a/src/gromacs/mdlib/tests/constrtestrunners.cu b/src/gromacs/mdlib/tests/constrtestrunners.cu index 5c0a007ee4..00672af606 100644 --- a/src/gromacs/mdlib/tests/constrtestrunners.cu +++ b/src/gromacs/mdlib/tests/constrtestrunners.cu @@ -72,9 +72,10 @@ void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc) { DeviceInformation deviceInfo; const DeviceContext deviceContext(deviceInfo); + const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false); auto lincsGpu = std::make_unique(testData->ir_.nLincsIter, testData->ir_.nProjOrder, - deviceContext, nullptr); + deviceContext, deviceStream); bool updateVelocities = true; int numAtoms = testData->numAtoms_; @@ -88,23 +89,23 @@ void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc) allocateDeviceBuffer(&d_xp, numAtoms, deviceContext); allocateDeviceBuffer(&d_v, numAtoms, deviceContext); - copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, nullptr, + copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer(&d_xp, (float3*)(testData->xPrime_.data()), 0, numAtoms, nullptr, + copyToDeviceBuffer(&d_xp, (float3*)(testData->xPrime_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyToDeviceBuffer(&d_v, (float3*)(testData->v_.data()), 0, numAtoms, nullptr, + copyToDeviceBuffer(&d_v, (float3*)(testData->v_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); } lincsGpu->apply(d_x, d_xp, updateVelocities, d_v, testData->invdt_, testData->computeVirial_, testData->virialScaled_, pbcAiuc); - copyFromDeviceBuffer((float3*)(testData->xPrime_.data()), &d_xp, 0, numAtoms, nullptr, + copyFromDeviceBuffer((float3*)(testData->xPrime_.data()), &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyFromDeviceBuffer((float3*)(testData->v_.data()), &d_v, 0, numAtoms, nullptr, + copyFromDeviceBuffer((float3*)(testData->v_.data()), &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); } diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu index b794149ddb..7b2e22aac2 100644 --- a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu +++ b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu @@ -68,6 +68,7 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps) { DeviceInformation deviceInfo; const DeviceContext deviceContext(deviceInfo); + const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false); int numAtoms = testData->numAtoms_; @@ -83,12 +84,12 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps) allocateDeviceBuffer(&d_v, numAtoms, deviceContext); allocateDeviceBuffer(&d_f, numAtoms, deviceContext); - copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, deviceStream, 
GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); - auto integrator = std::make_unique(deviceContext, nullptr); + auto integrator = std::make_unique(deviceContext, deviceStream); integrator->set(testData->mdAtoms_, testData->numTCoupleGroups_, testData->mdAtoms_.cTC); @@ -104,8 +105,8 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps) testData->dtPressureCouple_, testData->velocityScalingMatrix_); } - copyFromDeviceBuffer(h_xp, &d_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); - copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer(h_xp, &d_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); freeDeviceBuffer(&d_x); freeDeviceBuffer(&d_xp); diff --git a/src/gromacs/mdlib/tests/settletestrunners.cu b/src/gromacs/mdlib/tests/settletestrunners.cu index 6ebc6688da..741d2951aa 100644 --- a/src/gromacs/mdlib/tests/settletestrunners.cu +++ b/src/gromacs/mdlib/tests/settletestrunners.cu @@ -88,8 +88,9 @@ void applySettleGpu(SettleTestData* testData, DeviceInformation deviceInfo; const DeviceContext deviceContext(deviceInfo); + const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false); - auto settleGpu = std::make_unique(testData->mtop_, deviceContext, nullptr); + auto settleGpu = std::make_unique(testData->mtop_, deviceContext, deviceStream); settleGpu->set(*testData->idef_, testData->mdatoms_); PbcAiuc pbcAiuc; @@ -107,19 +108,20 @@ void applySettleGpu(SettleTestData* testData, allocateDeviceBuffer(&d_xp, numAtoms, deviceContext); allocateDeviceBuffer(&d_v, numAtoms, deviceContext); - copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyToDeviceBuffer(&d_v, (float3*)h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_v, (float3*)h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); } settleGpu->apply(d_x, d_xp, updateVelocities, d_v, testData->reciprocalTimeStep_, calcVirial, testData->virial_, pbcAiuc); - copyFromDeviceBuffer((float3*)h_xp, &d_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer((float3*)h_xp, &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyFromDeviceBuffer((float3*)h_v, &d_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer((float3*)h_v, &d_v, 0, numAtoms, deviceStream, + GpuApiCallBehavior::Sync, nullptr); } freeDeviceBuffer(&d_x); diff --git a/src/gromacs/mdlib/update_constrain_gpu.h b/src/gromacs/mdlib/update_constrain_gpu.h index 61f8537efa..c0a5e9b21a 100644 --- a/src/gromacs/mdlib/update_constrain_gpu.h +++ b/src/gromacs/mdlib/update_constrain_gpu.h @@ -50,8 +50,8 @@ #include "gromacs/utility/classhelpers.h" class DeviceContext; +class DeviceStream; class GpuEventSynchronizer; - struct gmx_mtop_t; enum class PbcType : int; class InteractionDefinitions; @@ -68,7 +68,7 @@ class UpdateConstrainGpu public: /*! 
\brief Create Update-Constrain object.
*
- * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+ * The constructor is given a non-nullptr \p deviceStream, in which all the update and constrain
* routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
* coordinates. The event is maintained outside this class and also passed to all (if any) consumers
* of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
* @@ -79,13 +79,13 @@ public:
* \param[in] mtop Topology of the system: SETTLE gets the masses for O and H atoms
* and target O-H and H-H distances from this object.
* \param[in] deviceContext GPU device context.
- * \param[in] commandStream GPU stream to use. Can be nullptr.
+ * \param[in] deviceStream GPU stream to use.
* \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU.
*/
UpdateConstrainGpu(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice);
~UpdateConstrainGpu();

diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp index 45a0743384..76f3a0eedf 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
@@ -58,7 +58,7 @@ class UpdateConstrainGpu::Impl
UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& /* ir */,
const gmx_mtop_t& /* mtop */,
const DeviceContext& /* deviceContext */,
- const void* /* commandStream */,
+ const DeviceStream& /* deviceStream */,
GpuEventSynchronizer* /* xUpdatedOnDevice */) :
impl_(nullptr)
{

diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu index 41f7572332..eed9e44d63 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -119,7 +119,7 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fRead
clear_mat(virial);

// Make sure that the forces are ready on device before proceeding with the update.
- fReadyOnDevice->enqueueWaitEvent(commandStream_);
+ fReadyOnDevice->enqueueWaitEvent(deviceStream_);

// The integrate should save a copy of the current coordinates in d_xp_ and write updated ones
// into d_x_. The d_xp_ is only needed by constraints.
@@ -141,7 +141,7 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fRead
}
}

- coordinatesReady_->markEvent(commandStream_);
+ coordinatesReady_->markEvent(deviceStream_);

return;
}
@@ -162,31 +162,30 @@ void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
"scaleCoordinates_kernel", kernelArgs);
// TODO: Although this only happens on the pressure coupling steps, this synchronization
// can affect the performance if nstpcouple is small.
- gpuStreamSynchronize(commandStream_);
+ deviceStream_.synchronize();
}

UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice) :
deviceContext_(deviceContext),
+ deviceStream_(deviceStream),
coordinatesReady_(xUpdatedOnDevice)
{
GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr.");
- commandStream != nullptr ?
commandStream_ = *static_cast<const CommandStream*>(commandStream)
- : commandStream_ = nullptr;

- integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, commandStream_);
- lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, commandStream_);
- settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, commandStream_);
+ integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, deviceStream_);
+ lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, deviceStream_);
+ settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, deviceStream_);

coordinateScalingKernelLaunchConfig_.blockSize[0] = c_threadsPerBlock;
coordinateScalingKernelLaunchConfig_.blockSize[1] = 1;
coordinateScalingKernelLaunchConfig_.blockSize[2] = 1;
coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
- coordinateScalingKernelLaunchConfig_.stream = commandStream_;
+ coordinateScalingKernelLaunchConfig_.stream = deviceStream_.stream();
}

UpdateConstrainGpu::Impl::~Impl() {}
@@ -235,9 +234,9 @@ GpuEventSynchronizer* UpdateConstrainGpu::Impl::getCoordinatesReadySync()
UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice) :
- impl_(new Impl(ir, mtop, deviceContext, commandStream, xUpdatedOnDevice))
+ impl_(new Impl(ir, mtop, deviceContext, deviceStream, xUpdatedOnDevice))
{
}

diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h index dd46010e93..8aacc28a61 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.h +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -65,7 +65,7 @@ class UpdateConstrainGpu::Impl
public:
/*! \brief Create Update-Constrain object.
*
- * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+ * The constructor is given a non-nullptr \p deviceStream, in which all the update and constrain
* routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
* coordinates. The event is maintained outside this class and also passed to all (if any) consumers
* of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
* @@ -76,13 +76,13 @@ public:
* \param[in] mtop Topology of the system: SETTLE gets the masses for O and H atoms
* and target O-H and H-H distances from this object.
* \param[in] deviceContext GPU device context.
- * \param[in] commandStream GPU stream to use. Can be nullptr.
+ * \param[in] deviceStream GPU stream to use.
* \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU.
*/
Impl(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice);
~Impl();
@@ -171,7 +171,7 @@ private:
//! GPU context object
const DeviceContext& deviceContext_;
//! GPU stream
- CommandStream commandStream_ = nullptr;
+ const DeviceStream& deviceStream_;
//!
GPU kernel launch config
KernelLaunchConfig coordinateScalingKernelLaunchConfig_;

diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 941a7030c9..91360ccd28 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp
@@ -403,9 +403,10 @@ void gmx::LegacySimulator::do_md()
GMX_RELEASE_ASSERT(fr->deviceContext != nullptr,
"GPU device context should be initialized to use GPU update.");
-
+ GMX_RELEASE_ASSERT(stateGpu->getUpdateStream() != nullptr,
+ "Update stream can not be nullptr when update is on a GPU.");
integrator = std::make_unique<UpdateConstrainGpu>(*ir, *top_global, *fr->deviceContext,
- stateGpu->getUpdateStream(),
+ *stateGpu->getUpdateStream(),
stateGpu->xUpdatedOnDevice());

integrator->setPbc(PbcType::Xyz, state->box);
@@ -867,14 +868,20 @@ void gmx::LegacySimulator::do_md()
&& useGpuForNonbonded && is1D(*cr->dd))
{
// TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
- void* streamLocal =
+ const DeviceStream* localStream =
Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
- void* streamNonLocal = Nbnxm::gpu_get_command_stream(
+ const DeviceStream* nonLocalStream = Nbnxm::gpu_get_command_stream(
fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
GMX_RELEASE_ASSERT(
fr->deviceContext != nullptr,
"GPU device context should be initialized to use GPU halo exchange.");
- constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, streamLocal, streamNonLocal);
+ GMX_RELEASE_ASSERT(localStream != nullptr,
+ "Local non-bonded stream can't be nullptr when using GPU "
+ "halo exchange.");
+ GMX_RELEASE_ASSERT(nonLocalStream != nullptr,
+ "Non-local non-bonded stream can't be nullptr when using "
+ "GPU halo exchange.");
+ constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, *localStream, *nonLocalStream);
}
}
}

diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 96e157ca07..753a43ab35 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp
@@ -1373,7 +1373,9 @@ int Mdrunner::mdrunner()
GMX_RELEASE_ASSERT(
fr->deviceContext != nullptr,
"Device context can not be nullptr when computing bonded interactions on GPU.");
- gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, stream, wcycle);
+ GMX_RELEASE_ASSERT(stream != nullptr,
+ "Can't run GPU version of bonded forces in nullptr stream.");
+ gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, *stream, wcycle);
fr->gpuBonded = gpuBonded.get();
}
@@ -1584,12 +1586,12 @@ int Mdrunner::mdrunner()
&& ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME)) || runScheduleWork.simulationWork.useGpuBufferOps))
{
- const void* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
- const void* localStream =
+ const DeviceStream* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
+ const DeviceStream* localStream =
fr->nbv->gpu_nbv != nullptr
? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local)
: nullptr;
- const void* nonLocalStream =
+ const DeviceStream* nonLocalStream =
fr->nbv->gpu_nbv != nullptr
?
Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal) : nullptr; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index 034e7eb604..678fa33681 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -60,6 +60,7 @@ #include "locality.h" class DeviceContext; +class DeviceStream; class GpuEventSynchronizer; struct gmx_wallcycle; @@ -99,9 +100,6 @@ public: * \p pmeStream argument needs to be a valid OpenCL queue object * which must have been created in \p deviceContext. * - * \todo Make a \p CommandStream visible in the CPU parts of the code so we - * will not have to pass a void*. - * * \param[in] pmeStream Device PME stream, nullptr allowed. * \param[in] localStream Device NBNXM local stream, nullptr allowed. * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed. @@ -110,9 +108,9 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - StatePropagatorDataGpu(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, + StatePropagatorDataGpu(const DeviceStream* pmeStream, + const DeviceStream* localStream, + const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -134,7 +132,7 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - StatePropagatorDataGpu(const void* pmeStream, + StatePropagatorDataGpu(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -329,7 +327,7 @@ public: * * \returns The device command stream to use in update-constraints. */ - void* getUpdateStream(); + const DeviceStream* getUpdateStream(); /*! \brief Getter for the number of local atoms. 
* diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index 1029dd220f..78b1fd3a4a 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -54,9 +54,9 @@ class StatePropagatorDataGpu::Impl { }; -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */, - const void* /* localStream */, - const void* /* nonLocalStream */, +StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream */, + const DeviceStream* /* localStream */, + const DeviceStream* /* nonLocalStream */, const DeviceContext& /* deviceContext */, GpuApiCallBehavior /* transferKind */, int /* paddingSize */, @@ -65,7 +65,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */ { } -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */, +StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream */, const DeviceContext& /* deviceContext */, GpuApiCallBehavior /* transferKind */, int /* paddingSize */, @@ -242,7 +242,7 @@ void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality /* atomLocality } -void* StatePropagatorDataGpu::getUpdateStream() +const DeviceStream* StatePropagatorDataGpu::getUpdateStream() { GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from " diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index 679bf2544a..1b2c91d2e2 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -99,9 +99,6 @@ public: * \p pmeStream argument needs to be a valid OpenCL queue object * which must have been created in \p deviceContext. * - * \todo Make a \p CommandStream visible in the CPU parts of the code so we - * will not have to pass a void*. - * * \param[in] pmeStream Device PME stream, nullptr allowed. * \param[in] localStream Device NBNXM local stream, nullptr allowed. * \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed. * \param[in] deviceContext Device context, must be initialized for OpenCL. * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - Impl(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, + Impl(const DeviceStream* pmeStream, + const DeviceStream* localStream, + const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -134,7 +131,7 @@ public: * \param[in] paddingSize Padding size for coordinates buffer. * \param[in] wcycle Wall cycle counter data. */ - Impl(const void* pmeStream, + Impl(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -325,7 +322,7 @@ public: * * \returns The device command stream to use in update-constraints. */ - void* getUpdateStream(); + const DeviceStream* getUpdateStream(); /*! \brief Getter for the number of local atoms. * @@ -341,20 +338,23 @@ public: private: //! GPU PME stream. - CommandStream pmeStream_ = nullptr; + const DeviceStream* pmeStream_; //! GPU NBNXM local stream. - CommandStream localStream_ = nullptr; - //! GPU NBNXM non-local stream + const DeviceStream* localStream_; + //! GPU NBNXM non-local stream. + const DeviceStream* nonLocalStream_; //! GPU Update-constraints stream.
- CommandStream updateStream_ = nullptr; + const DeviceStream* updateStream_; + + //! An owned update stream, used in case we manage the stream lifetime here. Temporary. + DeviceStream updateStreamOwn_; // Streams to use for coordinates H2D and D2H copies (one stream for each atom locality) - EnumerationArray xCopyStreams_ = { { nullptr } }; + EnumerationArray xCopyStreams_ = { { nullptr } }; // Streams to use for velocities H2D and D2H copies (one stream for each atom locality) - EnumerationArray vCopyStreams_ = { { nullptr } }; + EnumerationArray vCopyStreams_ = { { nullptr } }; // Streams to use for forces H2D and D2H copies (one stream for each atom locality) - EnumerationArray fCopyStreams_ = { { nullptr } }; + EnumerationArray fCopyStreams_ = { { nullptr } }; /*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality) * @@ -422,13 +422,13 @@ private: * \param[in] h_data Host-side buffer. * \param[in] dataSize Device-side data allocation size. * \param[in] atomLocality If all, local or non-local ranges should be copied. - * \param[in] commandStream GPU stream to execute copy in. + * \param[in] deviceStream GPU stream to execute copy in. */ void copyToDevice(DeviceBuffer d_data, gmx::ArrayRef h_data, int dataSize, AtomLocality atomLocality, - CommandStream commandStream); + const DeviceStream& deviceStream); /*! \brief Performs the copy of data from device to host buffer. * * \param[in] d_data Device-side buffer. * \param[in] dataSize Device-side data allocation size. * \param[in] atomLocality If all, local or non-local ranges should be copied. - * \param[in] commandStream GPU stream to execute copy in. + * \param[in] deviceStream GPU stream to execute copy in. */ void copyFromDevice(gmx::ArrayRef h_data, DeviceBuffer d_data, int dataSize, AtomLocality atomLocality, - CommandStream commandStream); + const DeviceStream& deviceStream); }; } // namespace gmx diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index d88f469711..b1fefd34a5 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -65,9 +65,9 @@ namespace gmx { -StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, +StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream, + const DeviceStream* localStream, + const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -86,8 +86,8 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds."); // The update stream is set to the PME stream in OpenCL, since the PME stream is the only stream created in the PME context.
- pmeStream_ = *static_cast(pmeStream); - updateStream_ = *static_cast(pmeStream); + pmeStream_ = pmeStream; + updateStream_ = pmeStream; GMX_UNUSED_VALUE(localStream); GMX_UNUSED_VALUE(nonLocalStream); } @@ -96,21 +96,24 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, { if (pmeStream != nullptr) { - pmeStream_ = *static_cast(pmeStream); + pmeStream_ = pmeStream; } if (localStream != nullptr) { - localStream_ = *static_cast(localStream); + localStream_ = localStream; } if (nonLocalStream != nullptr) { - nonLocalStream_ = *static_cast(nonLocalStream); + nonLocalStream_ = nonLocalStream; } // TODO: The update stream should be created only when it is needed. # if (GMX_GPU == GMX_GPU_CUDA) - cudaError_t stat; - stat = cudaStreamCreate(&updateStream_); - CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu"); + cudaError_t stat; + cudaStream_t stream; + stat = cudaStreamCreate(&stream); + CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu"); + updateStreamOwn_.setStream(stream); + updateStream_ = &updateStreamOwn_; # endif } @@ -131,7 +134,7 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, fCopyStreams_[AtomLocality::All] = updateStream_; } -StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, +StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -145,9 +148,8 @@ StatePropagatorDataGpu::Impl::Impl(const void* pmeStream, "This object should only be constructed on the GPU code-paths."); GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set."); - pmeStream_ = *static_cast(pmeStream); - - localStream_ = nullptr; + pmeStream_ = pmeStream; + localStream_ = pmeStream; // For clearing the force buffer nonLocalStream_ = nullptr; updateStream_ = nullptr; @@ -193,7 +195,7 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) if (paddingAllocationSize > 0) { // The PME stream is used here because the padding region of d_x_ is only in the PME task. - clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, pmeStream_); + clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, *pmeStream_); } reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_); @@ -204,7 +206,7 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) // since the force buffer ops are not implemented in OpenCL.
if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity) { - clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_); + clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_); } wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); @@ -247,7 +249,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer const gmx::ArrayRef h_data, int dataSize, AtomLocality atomLocality, - CommandStream commandStream) + const DeviceStream& deviceStream) { GMX_UNUSED_VALUE(dataSize); @@ -255,7 +257,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated."); - GMX_ASSERT(commandStream != nullptr, + GMX_ASSERT(deviceStream.stream() != nullptr, "No stream is valid for copying with given atom locality."); wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); @@ -271,7 +273,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer "The host buffer is smaller than the requested copy range."); copyToDeviceBuffer(&d_data, reinterpret_cast(&h_data.data()[atomsStartAt]), - atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr); + atomsStartAt, numAtomsToCopy, deviceStream, transferKind_, nullptr); } wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); @@ -282,7 +284,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_dat DeviceBuffer d_data, int dataSize, AtomLocality atomLocality, - CommandStream commandStream) + const DeviceStream& deviceStream) { GMX_UNUSED_VALUE(dataSize); @@ -290,7 +292,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_dat GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated."); - GMX_ASSERT(commandStream != nullptr, + GMX_ASSERT(deviceStream.stream() != nullptr, "No stream is valid for copying with given atom locality."); wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); @@ -306,7 +308,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_dat "The host buffer is smaller than the requested copy range."); copyFromDeviceBuffer(reinterpret_cast(&h_data.data()[atomsStartAt]), &d_data, - atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr); + atomsStartAt, numAtomsToCopy, deviceStream, transferKind_, nullptr); } wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); @@ -322,14 +324,14 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef h_x, AtomLocality atomLocality) { GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); - CommandStream commandStream = xCopyStreams_[atomLocality]; - GMX_ASSERT(commandStream != nullptr, + const DeviceStream* deviceStream = xCopyStreams_[atomLocality]; + GMX_ASSERT(deviceStream != nullptr, "No stream is valid for copying positions with given atom locality."); wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); - copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream); + copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, *deviceStream); // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed. 
- xReadyOnHost_[atomLocality].markEvent(commandStream); + xReadyOnHost_[atomLocality].markEvent(*deviceStream); wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); wallcycle_stop(wcycle_, ewcLAUNCH_GPU); @@ -419,15 +421,15 @@ void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef h_v, AtomLocality atomLocality) { GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); - CommandStream commandStream = vCopyStreams_[atomLocality]; - GMX_ASSERT(commandStream != nullptr, + const DeviceStream* deviceStream = vCopyStreams_[atomLocality]; + GMX_ASSERT(deviceStream != nullptr, "No stream is valid for copying velocities with given atom locality."); wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); - copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream); - vReadyOnHost_[atomLocality].markEvent(commandStream); + copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, *deviceStream); + vReadyOnHost_[atomLocality].markEvent(*deviceStream); wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); wallcycle_stop(wcycle_, ewcLAUNCH_GPU); @@ -473,15 +475,15 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef h_f, AtomLocality atomLocality) { GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); - CommandStream commandStream = fCopyStreams_[atomLocality]; - GMX_ASSERT(commandStream != nullptr, + const DeviceStream* deviceStream = fCopyStreams_[atomLocality]; + GMX_ASSERT(deviceStream != nullptr, "No stream is valid for copying forces with given atom locality."); wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); - copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream); - fReadyOnHost_[atomLocality].markEvent(commandStream); + copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, *deviceStream); + fReadyOnHost_[atomLocality].markEvent(*deviceStream); wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); wallcycle_stop(wcycle_, ewcLAUNCH_GPU); @@ -529,9 +531,9 @@ void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocali wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA); } -void* StatePropagatorDataGpu::Impl::getUpdateStream() +const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream() { - return &updateStream_; + return updateStream_; } int StatePropagatorDataGpu::Impl::numAtomsLocal() @@ -545,9 +547,9 @@ int StatePropagatorDataGpu::Impl::numAtomsAll() } -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, - const void* localStream, - const void* nonLocalStream, +StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* pmeStream, + const DeviceStream* localStream, + const DeviceStream* nonLocalStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -556,7 +558,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, { } -StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream, +StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* pmeStream, const DeviceContext& deviceContext, GpuApiCallBehavior transferKind, int paddingSize, @@ -682,7 +684,7 @@ void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality atomLocality) } -void* StatePropagatorDataGpu::getUpdateStream() +const DeviceStream* StatePropagatorDataGpu::getUpdateStream() { return impl_->getUpdateStream(); } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu 
b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index e47834cb9a..54ce9f331b 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -366,7 +366,7 @@ static inline int calc_shmem_required_nonbonded(const int num_thre */ void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality) { - cudaStream_t stream = nb->stream[interactionLocality]; + const DeviceStream& deviceStream = nb->deviceStreams[interactionLocality]; /* When we get here all misc operations issued in the local stream as well as the local xq H2D are done, @@ -378,12 +378,13 @@ void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocal { if (interactionLocality == InteractionLocality::Local) { - cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream); + cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, deviceStream.stream()); CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed"); } else { - cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0); + cudaError_t stat = + cudaStreamWaitEvent(deviceStream.stream(), nb->misc_ops_and_local_H2D_done, 0); CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed"); } } @@ -401,10 +402,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ - cu_atomdata_t* adat = nb->atdat; - cu_plist_t* plist = nb->plist[iloc]; - cu_timers_t* t = nb->timers; - cudaStream_t stream = nb->stream[iloc]; + cu_atomdata_t* adat = nb->atdat; + cu_plist_t* plist = nb->plist[iloc]; + cu_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -440,16 +441,16 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom /* beginning of timed HtoD section */ if (bDoTime) { - t->xf[atomLocality].nb_h2d.openTimingRegion(stream); + t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); } cu_copy_H2D_async(adat->xq + adat_begin, static_cast(nbatom->x().data() + adat_begin * 4), - adat_len * sizeof(*adat->xq), stream); + adat_len * sizeof(*adat->xq), deviceStream.stream()); if (bDoTime) { - t->xf[atomLocality].nb_h2d.closeTimingRegion(stream); + t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); } /* When we get here all misc operations issued in the local stream as well as @@ -480,11 +481,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom */ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc) { - cu_atomdata_t* adat = nb->atdat; - cu_nbparam_t* nbp = nb->nbparam; - cu_plist_t* plist = nb->plist[iloc]; - cu_timers_t* t = nb->timers; - cudaStream_t stream = nb->stream[iloc]; + cu_atomdata_t* adat = nb->atdat; + cu_nbparam_t* nbp = nb->nbparam; + cu_plist_t* plist = nb->plist[iloc]; + cu_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -522,7 +523,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In /* beginning of timed nonbonded calculation section */ if (bDoTime) { - t->interaction[iloc].nb_k.openTimingRegion(stream); + t->interaction[iloc].nb_k.openTimingRegion(deviceStream); } /* Kernel launch config: @@ -544,7 +545,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In config.blockSize[2] = 
num_threads_z; config.gridSize[0] = nblock; config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->deviceInfo, nbp); - config.stream = stream; + config.stream = deviceStream.stream(); if (debug) { @@ -567,13 +568,13 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In if (bDoTime) { - t->interaction[iloc].nb_k.closeTimingRegion(stream); + t->interaction[iloc].nb_k.closeTimingRegion(deviceStream); } if (GMX_NATIVE_WINDOWS) { /* Windows: force flushing WDDM queue */ - cudaStreamQuery(stream); + cudaStreamQuery(deviceStream.stream()); } } @@ -592,11 +593,11 @@ static inline int calc_shmem_required_prune(const int num_threads_z) void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { - cu_atomdata_t* adat = nb->atdat; - cu_nbparam_t* nbp = nb->nbparam; - cu_plist_t* plist = nb->plist[iloc]; - cu_timers_t* t = nb->timers; - cudaStream_t stream = nb->stream[iloc]; + cu_atomdata_t* adat = nb->atdat; + cu_nbparam_t* nbp = nb->nbparam; + cu_plist_t* plist = nb->plist[iloc]; + cu_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -652,7 +653,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c /* beginning of timed prune calculation section */ if (bDoTime) { - timer->openTimingRegion(stream); + timer->openTimingRegion(deviceStream); } /* Kernel launch config: @@ -668,7 +669,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c config.blockSize[2] = num_threads_z; config.gridSize[0] = nblock; config.sharedMemorySize = calc_shmem_required_prune(num_threads_z); - config.stream = stream; + config.stream = deviceStream.stream(); if (debug) { @@ -704,13 +705,13 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c if (bDoTime) { - timer->closeTimingRegion(stream); + timer->closeTimingRegion(deviceStream); } if (GMX_NATIVE_WINDOWS) { /* Windows: force flushing WDDM queue */ - cudaStreamQuery(stream); + cudaStreamQuery(deviceStream.stream()); } } @@ -728,10 +729,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); /* extract the data */ - cu_atomdata_t* adat = nb->atdat; - cu_timers_t* t = nb->timers; - bool bDoTime = nb->bDoTime; - cudaStream_t stream = nb->stream[iloc]; + cu_atomdata_t* adat = nb->atdat; + cu_timers_t* t = nb->timers; + bool bDoTime = nb->bDoTime; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; /* don't launch non-local copy-back if there was no non-local work to do */ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) @@ -744,14 +745,14 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* beginning of timed D2H section */ if (bDoTime) { - t->xf[atomLocality].nb_d2h.openTimingRegion(stream); + t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream); } /* With DD the local D2H transfer can only start after the non-local kernel has finished. 
*/ if (iloc == InteractionLocality::Local && nb->bUseTwoStreams) { - stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0); + stat = cudaStreamWaitEvent(deviceStream.stream(), nb->nonlocal_done, 0); CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed"); } @@ -761,7 +762,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, if (!stepWork.useGpuFBufferOps) { cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin, - (adat_len) * sizeof(*adat->f), stream); + (adat_len) * sizeof(*adat->f), deviceStream.stream()); } /* After the non-local D2H is launched the nonlocal_done event can be @@ -770,7 +771,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, back first. */ if (iloc == InteractionLocality::NonLocal) { - stat = cudaEventRecord(nb->nonlocal_done, stream); + stat = cudaEventRecord(nb->nonlocal_done, deviceStream.stream()); CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed"); } @@ -780,20 +781,21 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH fshift when virial is needed */ if (stepWork.computeVirial) { - cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift), stream); + cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift), + deviceStream.stream()); } /* DtoH energies */ if (stepWork.computeEnergy) { - cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), stream); - cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), stream); + cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), deviceStream.stream()); + cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), deviceStream.stream()); } } if (bDoTime) { - t->xf[atomLocality].nb_d2h.closeTimingRegion(stream); + t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream); } } @@ -834,7 +836,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, const int numAtomsPerCell = grid.numAtomsPerCell(); Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality); - cudaStream_t stream = nb->stream[interactionLoc]; + const DeviceStream& deviceStream = nb->deviceStreams[interactionLoc]; int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin(); // avoid empty kernel launch, skip to inserting stream dependency @@ -845,7 +847,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, // ensure that coordinates are ready on the device before launching the kernel GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object"); - xReadyOnDevice->enqueueWaitEvent(stream); + xReadyOnDevice->enqueueWaitEvent(deviceStream); KernelLaunchConfig config; config.blockSize[0] = c_bufOpsThreadsPerBlock; @@ -858,7 +860,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, GMX_ASSERT(config.gridSize[0] > 0, "Can not have empty grid, early return above avoids this"); config.sharedMemorySize = 0; - config.stream = stream; + config.stream = deviceStream.stream(); auto kernelFn = setFillerCoords ? 
nbnxn_gpu_x_to_nbat_x_kernel : nbnxn_gpu_x_to_nbat_x_kernel; @@ -873,7 +875,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs); } // TODO: note that this is not necessary when there are no local atoms, that is: // (numAtoms == 0 && interactionLoc == InteractionLocality::Local) // but for now we avoid that optimization nbnxnInsertNonlocalGpuDependency(nb, interactionLoc); @@ -899,9 +901,9 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms"); GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer"); - const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); - cudaStream_t stream = nb->stream[iLocality]; - cu_atomdata_t* adat = nb->atdat; + const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); + const DeviceStream& deviceStream = nb->deviceStreams[iLocality]; + cu_atomdata_t* adat = nb->atdat; size_t gmx_used_in_debug numDependency = static_cast((useGpuFPmeReduction == true)) + static_cast((accumulateForce == true)); @@ -911,7 +913,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo // Enqueue wait on all dependencies passed for (auto const synchronizer : dependencyList) { - synchronizer->enqueueWaitEvent(stream); + synchronizer->enqueueWaitEvent(deviceStream); } /* launch kernel */ @@ -924,7 +926,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo config.gridSize[1] = 1; config.gridSize[2] = 1; config.sharedMemorySize = 0; - config.stream = stream; + config.stream = deviceStream.stream(); auto kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel : nbnxn_gpu_add_nbat_f_to_f_kernel; @@ -950,7 +952,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo { GMX_ASSERT(nb->localFReductionDone != nullptr, "localFReductionDone has to be a valid pointer"); - nb->localFReductionDone->markEvent(stream); + nb->localFReductionDone->markEvent(deviceStream); } } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 666aefc629..cbd5f8ce01 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -448,7 +448,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, nb->deviceInfo = deviceInfo; /* local/non-local GPU streams */ - stat = cudaStreamCreate(&nb->stream[InteractionLocality::Local]); + cudaStream_t localStream; + stat = cudaStreamCreate(&localStream); CU_RET_ERR(stat, "cudaStreamCreate on stream[InteractionLocality::Local] failed"); + nb->deviceStreams[InteractionLocality::Local].setStream(localStream); if (nb->bUseTwoStreams) { @@ -462,8 +464,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority); CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed"); - stat = cudaStreamCreateWithPriority(&nb->stream[InteractionLocality::NonLocal], - cudaStreamDefault, highest_priority); + cudaStream_t nonLocalStream; + stat = cudaStreamCreateWithPriority(&nonLocalStream, cudaStreamDefault, highest_priority); CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed"); + nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream); } @@ -512,10 +515,10 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, void
gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc) { - char sbuf[STRLEN]; - bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); - cudaStream_t stream = nb->stream[iloc]; - cu_plist_t* d_plist = nb->plist[iloc]; + char sbuf[STRLEN]; + bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; + cu_plist_t* d_plist = nb->plist[iloc]; if (d_plist->na_c < 0) { @@ -535,18 +538,18 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte if (bDoTime) { - iTimers.pl_h2d.openTimingRegion(stream); + iTimers.pl_h2d.openTimingRegion(deviceStream); iTimers.didPairlistH2D = true; } reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, DeviceContext()); - copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, + copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, DeviceContext()); - copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, + copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit, @@ -554,12 +557,12 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, DeviceContext()); - copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, + copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? 
iTimers.pl_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - iTimers.pl_h2d.closeTimingRegion(stream); + iTimers.pl_h2d.closeTimingRegion(deviceStream); } /* the next use of this list will be the first one, so we need to prune */ @@ -569,7 +572,7 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { cu_atomdata_t* adat = nb->atdat; - cudaStream_t ls = nb->stream[InteractionLocality::Local]; + cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream(); /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) @@ -584,7 +587,7 @@ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear) { cudaError_t stat; cu_atomdata_t* adat = nb->atdat; - cudaStream_t ls = nb->stream[InteractionLocality::Local]; + cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream(); stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls); CU_RET_ERR(stat, "cudaMemsetAsync on f failed"); @@ -595,7 +598,7 @@ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb) { cudaError_t stat; cu_atomdata_t* adat = nb->atdat; - cudaStream_t ls = nb->stream[InteractionLocality::Local]; + cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream(); stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls); CU_RET_ERR(stat, "cudaMemsetAsync on fshift failed"); @@ -618,13 +621,13 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { - cudaError_t stat; - int nalloc, natoms; - bool realloced; - bool bDoTime = nb->bDoTime; - cu_timers_t* timers = nb->timers; - cu_atomdata_t* d_atdat = nb->atdat; - cudaStream_t ls = nb->stream[InteractionLocality::Local]; + cudaError_t stat; + int nalloc, natoms; + bool realloced; + bool bDoTime = nb->bDoTime; + cu_timers_t* timers = nb->timers; + cu_atomdata_t* d_atdat = nb->atdat; + const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local]; natoms = nbat->numAtoms(); realloced = false; @@ -632,7 +635,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (bDoTime) { /* time async copy */ - timers->atdat.openTimingRegion(ls); + timers->atdat.openTimingRegion(deviceStream); } /* need to reallocate if we have to copy more atoms than the amount of space @@ -681,17 +684,17 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam)) { cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), - natoms * sizeof(*d_atdat->lj_comb), ls); + natoms * sizeof(*d_atdat->lj_comb), deviceStream.stream()); } else { cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), - natoms * sizeof(*d_atdat->atom_types), ls); + natoms * sizeof(*d_atdat->atom_types), deviceStream.stream()); } if (bDoTime) { - timers->atdat.closeTimingRegion(ls); + timers->atdat.closeTimingRegion(deviceStream); } } @@ -725,15 +728,6 @@ void gpu_free(NbnxmGpu* nb) CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done"); delete nb->timers; - if (nb->bDoTime) - { - /* The non-local counters/stream (second in the array) are needed only with DD. */ - for (int i = 0; i <= (nb->bUseTwoStreams ?
1 : 0); i++) - { - stat = cudaStreamDestroy(nb->stream[i]); - CU_RET_ERR(stat, "cudaStreamDestroy failed on stream"); - } - } if (!useLjCombRule(nb->nbparam)) { @@ -822,11 +816,11 @@ gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb) return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN)); } -void* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc) +const DeviceStream* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc) { assert(nb); - return static_cast(&nb->stream[iloc]); + return &nb->deviceStreams[iloc]; } void* gpu_get_xq(NbnxmGpu* nb) @@ -854,9 +848,9 @@ DeviceBuffer gpu_get_fshift(NbnxmGpu* nb) /* TODO Remove explicit pinning from host arrays from here and manage in a more natural way*/ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv) { - cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local]; - bool bDoTime = gpu_nbv->bDoTime; - const int maxNumColumns = gridSet.numColumnsMax(); + const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local]; + bool bDoTime = gpu_nbv->bDoTime; + const int maxNumColumns = gridSet.numColumnsMax(); reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(), &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext()); @@ -882,15 +876,15 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream); } - copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, + copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, deviceStream, GpuApiCallBehavior::Async, nullptr); if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream); } } @@ -898,28 +892,30 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv { if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream); } int* destPtr = &gpu_nbv->cxy_na[maxNumColumns * g]; - copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr); + copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, deviceStream, + GpuApiCallBehavior::Async, nullptr); if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream); } if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream); } destPtr = &gpu_nbv->cxy_ind[maxNumColumns * g]; - copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr); + copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, deviceStream, + GpuApiCallBehavior::Async, nullptr); if (bDoTime) { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream); + gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream); } } } @@ -943,7 +939,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int* cell, GpuEventSynchronizer* const localReductionDone) { - cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local]; + const 
DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local]; GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer"); gpu_nbv->localFReductionDone = localReductionDone; @@ -952,7 +948,8 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int* cell, { reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, DeviceContext()); - copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr); + copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, deviceStream, + GpuApiCallBehavior::Async, nullptr); } return; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index da607e4429..d2bbfa6b8e 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -303,7 +303,7 @@ struct NbnxmGpu /*! \brief staging area where fshift/energies get downloaded */ nb_staging_t nbst; /*! \brief local and non-local GPU streams */ - gmx::EnumerationArray stream = { { nullptr } }; + gmx::EnumerationArray deviceStreams; /*! \brief Events used for synchronization */ /*! \{ */ diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index 65f38daea2..f5b3d813da 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -399,7 +399,7 @@ bool gpu_try_finish_task(NbnxmGpu* nb, // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller. wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L); - if (!haveStreamTasksCompleted(nb->stream[iLocality])) + if (!haveStreamTasksCompleted(nb->deviceStreams[iLocality])) { wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); @@ -412,7 +412,7 @@ bool gpu_try_finish_task(NbnxmGpu* nb, } else if (haveResultToWaitFor) { - gpuStreamSynchronize(nb->stream[iLocality]); + nb->deviceStreams[iLocality].synchronize(); } // TODO: this needs to be moved later because conditional wait could break timing diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h index 822852786c..e242771862 100644 --- a/src/gromacs/nbnxm/gpu_data_mgmt.h +++ b/src/gromacs/nbnxm/gpu_data_mgmt.h @@ -51,6 +51,7 @@ #include "gromacs/mdtypes/locality.h" class DeviceContext; +class DeviceStream; struct NbnxmGpu; struct gmx_gpu_info_t; @@ -126,7 +127,7 @@ gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu gmx_unused* nb) GPU_FUNC_ * Note: CUDA only.
*/ CUDA_FUNC_QUALIFIER -void* gpu_get_command_stream(NbnxmGpu gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc) +const DeviceStream* gpu_get_command_stream(NbnxmGpu gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc) CUDA_FUNC_TERM_WITH_RETURN(nullptr); /** Returns an opaque pointer to the GPU coordinate+charge array diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 013bd093a0..ba0c2ee939 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -484,10 +484,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom /* local/nonlocal offset and length used for xq and f */ int adat_begin, adat_len; - cl_atomdata_t* adat = nb->atdat; - cl_plist_t* plist = nb->plist[iloc]; - cl_timers_t* t = nb->timers; - cl_command_queue stream = nb->stream[iloc]; + cl_atomdata_t* adat = nb->atdat; + cl_plist_t* plist = nb->plist[iloc]; + cl_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -522,17 +522,17 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom /* beginning of timed HtoD section */ if (bDoTime) { - t->xf[atomLocality].nb_h2d.openTimingRegion(stream); + t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); } /* HtoD x, q */ - ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, - adat_begin * sizeof(float) * 4, adat_len * sizeof(float) * 4, stream, + ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * sizeof(float) * 4, + adat_len * sizeof(float) * 4, deviceStream.stream(), bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - t->xf[atomLocality].nb_h2d.closeTimingRegion(stream); + t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); } /* When we get here all misc operations issued in the local stream as well as @@ -543,7 +543,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom if (iloc == InteractionLocality::Local) { cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList( - stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done)); + deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done)); GMX_ASSERT(cl_error == CL_SUCCESS, ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str()); @@ -551,13 +551,13 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom * in the local stream in order to be able to sync with the above event * from the non-local stream.
*/ - cl_error = clFlush(stream); + cl_error = clFlush(deviceStream.stream()); GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); } else { - sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done)); + sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done)); } } } @@ -583,11 +583,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom */ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc) { - cl_atomdata_t* adat = nb->atdat; - cl_nbparam_t* nbp = nb->nbparam; - cl_plist_t* plist = nb->plist[iloc]; - cl_timers_t* t = nb->timers; - cl_command_queue stream = nb->stream[iloc]; + cl_atomdata_t* adat = nb->atdat; + cl_nbparam_t* nbp = nb->nbparam; + cl_plist_t* plist = nb->plist[iloc]; + cl_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; bool bDoTime = nb->bDoTime; @@ -628,14 +628,14 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb /* beginning of timed nonbonded calculation section */ if (bDoTime) { - t->interaction[iloc].nb_k.openTimingRegion(stream); + t->interaction[iloc].nb_k.openTimingRegion(deviceStream); } /* kernel launch config */ KernelLaunchConfig config; config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam); - config.stream = stream; + config.stream = deviceStream.stream(); config.blockSize[0] = c_clSize; config.blockSize[1] = c_clSize; config.gridSize[0] = plist->nsci; @@ -686,7 +686,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb if (bDoTime) { - t->interaction[iloc].nb_k.closeTimingRegion(stream); + t->interaction[iloc].nb_k.closeTimingRegion(deviceStream); } } @@ -722,12 +722,12 @@ static inline int calc_shmem_required_prune(const int num_threads_z) */ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { - cl_atomdata_t* adat = nb->atdat; - cl_nbparam_t* nbp = nb->nbparam; - cl_plist_t* plist = nb->plist[iloc]; - cl_timers_t* t = nb->timers; - cl_command_queue stream = nb->stream[iloc]; - bool bDoTime = nb->bDoTime; + cl_atomdata_t* adat = nb->atdat; + cl_nbparam_t* nbp = nb->nbparam; + cl_plist_t* plist = nb->plist[iloc]; + cl_timers_t* t = nb->timers; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; + bool bDoTime = nb->bDoTime; if (plist->haveFreshList) { @@ -781,7 +781,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c /* beginning of timed prune calculation section */ if (bDoTime) { - timer->openTimingRegion(stream); + timer->openTimingRegion(deviceStream); } /* Kernel launch config: @@ -795,7 +795,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c /* kernel launch config */ KernelLaunchConfig config; config.sharedMemorySize = calc_shmem_required_prune(num_threads_z); - config.stream = stream; + config.stream = deviceStream.stream(); config.blockSize[0] = c_clSize; config.blockSize[1] = c_clSize; config.blockSize[2] = num_threads_z; @@ -840,7 +840,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c if (bDoTime) { - timer->closeTimingRegion(stream); + timer->closeTimingRegion(deviceStream); } } @@ -861,10 +861,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* determine interaction locality from atom locality */ const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc); - cl_atomdata_t* adat = nb->atdat; - 
cl_timers_t* t = nb->timers; - bool bDoTime = nb->bDoTime; - cl_command_queue stream = nb->stream[iloc]; + cl_atomdata_t* adat = nb->atdat; + cl_timers_t* t = nb->timers; + bool bDoTime = nb->bDoTime; + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; /* don't launch non-local copy-back if there was no non-local work to do */ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) @@ -886,24 +886,24 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* beginning of timed D2H section */ if (bDoTime) { - t->xf[aloc].nb_d2h.openTimingRegion(stream); + t->xf[aloc].nb_d2h.openTimingRegion(deviceStream); } /* With DD the local D2H transfer can only start after the non-local has been launched. */ if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive) { - sync_ocl_event(stream, &(nb->nonlocal_done)); + sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done)); } /* DtoH f */ ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * DIM, adat->f, adat_begin * DIM * sizeof(nbatom->out[0].f[0]), - adat_len * DIM * sizeof(nbatom->out[0].f[0]), stream, + adat_len * DIM * sizeof(nbatom->out[0].f[0]), deviceStream.stream(), bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); /* kick off work */ - cl_error = clFlush(stream); + cl_error = clFlush(deviceStream.stream()); GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); /* After the non-local D2H is launched the nonlocal_done event can be @@ -912,7 +912,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, data back first. */ if (iloc == InteractionLocality::NonLocal) { - cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done)); + cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done)); GMX_ASSERT(cl_error == CL_SUCCESS, ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str()); nb->bNonLocalStreamActive = CL_TRUE; @@ -924,24 +924,25 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH fshift when virial is needed */ if (stepWork.computeVirial) { - ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, SHIFTS * sizeof(nb->nbst.fshift[0]), - stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); + ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, + SHIFTS * sizeof(nb->nbst.fshift[0]), deviceStream.stream(), + bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); } /* DtoH energies */ if (stepWork.computeEnergy) { - ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), stream, + ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), deviceStream.stream(), bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); - ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), stream, + ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), deviceStream.stream(), bDoTime ? 
t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); } } if (bDoTime) { - t->xf[aloc].nb_d2h.closeTimingRegion(stream); + t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream); } } diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index eb1234d512..fa37263a5b 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -487,7 +487,7 @@ static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb) cl_int cl_error; cl_atomdata_t* adat = nb->atdat; - cl_command_queue ls = nb->stream[InteractionLocality::Local]; + cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream(); size_t local_work_size[3] = { 1, 1, 1 }; size_t global_work_size[3] = { 1, 1, 1 }; @@ -606,10 +606,12 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, queue_properties = 0; } - /* local/non-local GPU streams */ - nb->stream[InteractionLocality::Local] = + cl_command_queue localStream = clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(), nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); + /* local/non-local GPU streams */ + nb->deviceStreams[InteractionLocality::Local].setStream(localStream); + if (CL_SUCCESS != cl_error) { gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank, @@ -620,9 +622,11 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, { init_plist(nb->plist[InteractionLocality::NonLocal]); - nb->stream[InteractionLocality::NonLocal] = + cl_command_queue nonLocalStream = clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(), nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); + nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream); + if (CL_SUCCESS != cl_error) { gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", @@ -675,7 +679,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear) cl_int gmx_used_in_debug cl_error; cl_atomdata_t* atomData = nb->atdat; - cl_command_queue ls = nb->stream[InteractionLocality::Local]; + cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream(); cl_float value = 0.0F; cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float), 0, @@ -697,7 +701,7 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) /* kick off buffer clearing kernel to ensure concurrency with constraints/update */ cl_int gmx_unused cl_error; - cl_error = clFlush(nb->stream[InteractionLocality::Local]); + cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local].stream()); GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); } @@ -708,9 +712,9 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte // Timing accumulation should happen only if there was work to do // because getLastRangeTime() gets skipped with empty lists later // which leads to the counter not being reset. 
- bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); - cl_command_queue stream = nb->stream[iloc]; - cl_plist_t* d_plist = nb->plist[iloc]; + bool bDoTime = (nb->bDoTime && !h_plist->sci.empty()); + const DeviceStream& deviceStream = nb->deviceStreams[iloc]; + cl_plist_t* d_plist = nb->plist[iloc]; if (d_plist->na_c < 0) { @@ -730,7 +734,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte if (bDoTime) { - iTimers.pl_h2d.openTimingRegion(stream); + iTimers.pl_h2d.openTimingRegion(deviceStream); iTimers.didPairlistH2D = true; } @@ -739,12 +743,12 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext); - copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream, + copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext); - copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream, + copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit, @@ -752,12 +756,12 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext); - copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream, + copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr); if (bDoTime) { - iTimers.pl_h2d.closeTimingRegion(stream); + iTimers.pl_h2d.closeTimingRegion(deviceStream); } /* need to prune the pair list during the next step */ @@ -768,7 +772,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { cl_atomdata_t* adat = nb->atdat; - cl_command_queue ls = nb->stream[InteractionLocality::Local]; + cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream(); /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) @@ -782,13 +786,13 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) //! 
This function is documented in the header file void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { - cl_int cl_error; - int nalloc, natoms; - bool realloced; - bool bDoTime = nb->bDoTime; - cl_timers_t* timers = nb->timers; - cl_atomdata_t* d_atdat = nb->atdat; - cl_command_queue ls = nb->stream[InteractionLocality::Local]; + cl_int cl_error; + int nalloc, natoms; + bool realloced; + bool bDoTime = nb->bDoTime; + cl_timers_t* timers = nb->timers; + cl_atomdata_t* d_atdat = nb->atdat; + const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local]; natoms = nbat->numAtoms(); realloced = false; @@ -796,7 +800,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (bDoTime) { /* time async copy */ - timers->atdat.openTimingRegion(ls); + timers->atdat.openTimingRegion(deviceStream); } /* need to reallocate if we have to copy more atoms than the amount of space @@ -859,21 +863,21 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam->vdwtype)) { ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0, natoms * sizeof(cl_float2), - ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); + deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } else { ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0, natoms * sizeof(int), - ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); + deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } if (bDoTime) { - timers->atdat.closeTimingRegion(ls); + timers->atdat.closeTimingRegion(deviceStream); } /* kick off the tasks enqueued above to ensure concurrency with the search */ - cl_error = clFlush(ls); + cl_error = clFlush(deviceStream.stream()); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); } @@ -996,14 +1000,6 @@ void gpu_free(NbnxmGpu* nb) pfree(nb->nbst.fshift); nb->nbst.fshift = nullptr; - /* Free command queues */ - clReleaseCommandQueue(nb->stream[InteractionLocality::Local]); - nb->stream[InteractionLocality::Local] = nullptr; - if (nb->bUseTwoStreams) - { - clReleaseCommandQueue(nb->stream[InteractionLocality::NonLocal]); - nb->stream[InteractionLocality::NonLocal] = nullptr; - } /* Free other events */ if (nb->nonlocal_done) { diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index 6e3a410161..a3583761fa 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -361,7 +361,7 @@ struct NbnxmGpu nb_staging_t nbst; //! local and non-local GPU queues - gmx::EnumerationArray stream; + gmx::EnumerationArray deviceStreams; /*! \brief Events used for synchronization */ /*! \{ */ -- 2.22.0
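Editor's note: the DeviceStream class itself is introduced in src/gromacs/gpu_utils/device_stream.h (with CUDA and OpenCL implementations in device_stream.cu and device_stream_ocl.cpp), which this excerpt does not show. The call sites above pin down the surface that matters for review: default construction (the EnumerationArray members), setStream() to adopt a raw handle, stream() to hand the raw cudaStream_t or cl_command_queue back to API calls and KernelLaunchConfig, and synchronize() to block the host. Below is a minimal sketch of that interface, CUDA flavor only; everything beyond those members, including the releasing destructor suggested by the deleted cudaStreamDestroy/clReleaseCommandQueue calls in the gpu_free hunks, is an assumption inferred from the diff, not a quote of the real header.

    #include <cuda_runtime.h>

    class DeviceStream
    {
    public:
        //! Default-constructible, so it can live inside EnumerationArray members.
        DeviceStream() = default;
        //! Non-copyable sketch: the object owns the handle once setStream() has run.
        DeviceStream(const DeviceStream&) = delete;
        DeviceStream& operator=(const DeviceStream&) = delete;
        //! Adopt a raw stream created elsewhere (see the gpu_init hunks above).
        void setStream(cudaStream_t stream) { stream_ = stream; }
        //! Expose the raw handle for cudaMemcpyAsync, KernelLaunchConfig, etc.
        cudaStream_t stream() const { return stream_; }
        //! Block the host until all work enqueued in the stream has completed.
        void synchronize() const { cudaStreamSynchronize(stream_); }
        //! Assumed: releases the handle, since gpu_free no longer destroys streams.
        ~DeviceStream()
        {
            if (stream_ != nullptr)
            {
                cudaStreamDestroy(stream_);
            }
        }

    private:
        cudaStream_t stream_ = nullptr;
    };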
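The caller-side convention is uniform across md.cpp, runner.cpp and the Nbnxm/PME getters: functions that hand a stream out still return const DeviceStream* (nullptr remains a legal "no such stream" value), while everything that actually uses a stream takes const DeviceStream&, so each call site asserts once and then dereferences. A hedged recap of that pattern follows; launchSomething() is a hypothetical consumer, the other names are taken from the hunks above.

    // Hypothetical consumer taking the stream by const reference.
    void launchSomething(const DeviceStream& deviceStream);

    void caller(NbnxmGpu* nb)
    {
        // Getter keeps pointer semantics: nullptr means the stream does not exist.
        const DeviceStream* localStream =
                Nbnxm::gpu_get_command_stream(nb, gmx::InteractionLocality::Local);
        GMX_RELEASE_ASSERT(localStream != nullptr,
                           "Local non-bonded stream can't be nullptr here.");
        // Consumers take const DeviceStream&, so the null check happens exactly once.
        launchSomething(*localStream);
    }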
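One wrinkle worth noting in StatePropagatorDataGpu::Impl: every stream member is now a non-owning const DeviceStream*, except the update stream on the CUDA path, which the Impl creates itself. The patch handles this with the owning value member updateStreamOwn_ next to the pointer updateStream_ that the rest of the class uses (explicitly marked Temporary in the comment above). Restated outside the diff, with the error check ahead of the handle adoption as in the corrected hunk:

    cudaStream_t stream;
    cudaError_t  stat = cudaStreamCreate(&stream);
    CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
    updateStreamOwn_.setStream(stream); // the owning DeviceStream adopts the raw handle
    updateStream_ = &updateStreamOwn_;  // all other code keeps using the pointer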