void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
const t_commrec& cr,
const DeviceContext& deviceContext,
- void* streamLocal,
- void* streamNonLocal)
+ const DeviceStream& streamLocal,
+ const DeviceStream& streamNonLocal)
{
int gpuHaloExchangeSize = 0;
struct t_nrnb;
struct gmx_wallcycle;
enum class PbcType : int;
+class DeviceStream;
class t_state;
class DeviceContext;
class GpuEventSynchronizer;
void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
const t_commrec& cr,
const DeviceContext& deviceContext,
- void* streamLocal,
- void* streamNonLocal);
+ const DeviceStream& streamLocal,
+ const DeviceStream& streamNonLocal);
/*! \brief
* (Re-) Initialization for GPU halo exchange
struct gmx_domdec_t;
class DeviceContext;
+class DeviceStream;
class GpuEventSynchronizer;
namespace gmx
GpuHaloExchange(gmx_domdec_t* dd,
MPI_Comm mpi_comm_mysim,
const DeviceContext& deviceContext,
- void* streamLocal,
- void* streamNonLocal,
+ const DeviceStream& streamLocal,
+ const DeviceStream& streamNonLocal,
int pulse);
~GpuHaloExchange();
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
MPI_Comm /* mpi_comm_mysim */,
const DeviceContext& /* deviceContext */,
- void* /*streamLocal */,
- void* /*streamNonLocal */,
+ const DeviceStream& /*streamLocal */,
+ const DeviceStream& /*streamNonLocal */,
int /*pulse */) :
impl_(nullptr)
{
d_x_ = d_coordinatesBuffer;
d_f_ = d_forcesBuffer;
- cudaStream_t stream = nonLocalStream_;
const gmx_domdec_comm_t& comm = *dd_->comm;
const gmx_domdec_comm_dim_t& cd = comm.cd[0];
const gmx_domdec_ind_t& ind = cd.ind[pulse_];
GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch");
std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
- copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, stream,
+ copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_,
GpuApiCallBehavior::Async, nullptr);
// This rank will push data to its neighbor, so needs to know
config.gridSize[1] = 1;
config.gridSize[2] = 1;
config.sharedMemorySize = 0;
- config.stream = nonLocalStream_;
+ config.stream = nonLocalStream_.stream();
const float3* sendBuf = d_sendBuf_;
const float3* d_x = d_x_;
if (!accumulateForces)
{
// Clear local portion of force array (in local stream)
- cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_);
+ cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_.stream());
}
// ensure non-local stream waits for local stream, due to dependence on
config.gridSize[1] = 1;
config.gridSize[2] = 1;
config.sharedMemorySize = 0;
- config.stream = nonLocalStream_;
+ config.stream = nonLocalStream_.stream();
const float3* recvBuf = d_recvBuf_;
const int* indexMap = d_indexMap_;
int recvRank)
{
- cudaError_t stat;
- cudaStream_t stream = nonLocalStream_;
+ cudaError_t stat;
// We asynchronously push data to remote rank. The remote
// destination pointer has already been set in the init fn. We
if (sendSize > 0)
{
stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize * DIM * sizeof(float),
- cudaMemcpyDeviceToDevice, stream);
+ cudaMemcpyDeviceToDevice, nonLocalStream_.stream());
CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
}
// to its stream.
GpuEventSynchronizer* haloDataTransferRemote;
- haloDataTransferLaunched_->markEvent(stream);
+ haloDataTransferLaunched_->markEvent(nonLocalStream_);
MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0,
&haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0,
mpi_comm_mysim_, MPI_STATUS_IGNORE);
- haloDataTransferRemote->enqueueWaitEvent(stream);
+ haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
#else
GMX_UNUSED_VALUE(sendRank);
GMX_UNUSED_VALUE(recvRank);
GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd,
MPI_Comm mpi_comm_mysim,
const DeviceContext& deviceContext,
- void* localStream,
- void* nonLocalStream,
+ const DeviceStream& localStream,
+ const DeviceStream& nonLocalStream,
int pulse) :
dd_(dd),
sendRankX_(dd->neighbor[0][1]),
haloDataTransferLaunched_(new GpuEventSynchronizer()),
mpi_comm_mysim_(mpi_comm_mysim),
deviceContext_(deviceContext),
- localStream_(*static_cast<cudaStream_t*>(localStream)),
- nonLocalStream_(*static_cast<cudaStream_t*>(nonLocalStream)),
+ localStream_(localStream),
+ nonLocalStream_(nonLocalStream),
pulse_(pulse)
{
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd,
MPI_Comm mpi_comm_mysim,
const DeviceContext& deviceContext,
- void* localStream,
- void* nonLocalStream,
+ const DeviceStream& localStream,
+ const DeviceStream& nonLocalStream,
int pulse) :
impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse))
{
Impl(gmx_domdec_t* dd,
MPI_Comm mpi_comm_mysim,
const DeviceContext& deviceContext,
- void* localStream,
- void* nonLocalStream,
+ const DeviceStream& localStream,
+ const DeviceStream& nonLocalStream,
int pulse);
~Impl();
//! GPU context object
const DeviceContext& deviceContext_;
//! CUDA stream for local non-bonded calculations
- cudaStream_t localStream_ = nullptr;
+ const DeviceStream& localStream_;
//! CUDA stream for non-local non-bonded calculations
- cudaStream_t nonLocalStream_ = nullptr;
+ const DeviceStream& nonLocalStream_;
//! full coordinates buffer in GPU memory
float3* d_x_ = nullptr;
//! full forces buffer in GPU memory
struct NumPmeDomains;
class DeviceContext;
+class DeviceStream;
enum class GpuTaskCompletion;
class PmeGpuProgram;
class GpuEventSynchronizer;
* \param[in] pme The PME data structure.
* \returns Pointer to GPU stream object.
*/
-GPU_FUNC_QUALIFIER void* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
GPU_FUNC_TERM_WITH_RETURN(nullptr);
/*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/gmxmpi.h"
+class DeviceStream;
struct PpRanks;
namespace gmx
* \param[in] comm Communicator used for simulation
* \param[in] ppRanks List of PP ranks
*/
- PmeCoordinateReceiverGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+ PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
~PmeCoordinateReceiverGpu();
/*! \brief
};
/*!\brief Constructor stub. */
-PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* /* pmeStream */,
+PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& /* pmeStream */,
MPI_Comm /* comm */,
gmx::ArrayRef<PpRanks> /* ppRanks */) :
impl_(nullptr)
namespace gmx
{
-PmeCoordinateReceiverGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks) :
- pmeStream_(*static_cast<const cudaStream_t*>(pmeStream)),
+PmeCoordinateReceiverGpu::Impl::Impl(const DeviceStream& pmeStream,
+ MPI_Comm comm,
+ gmx::ArrayRef<PpRanks> ppRanks) :
+ pmeStream_(pmeStream),
comm_(comm),
ppRanks_(ppRanks)
{
}
}
-PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* pmeStream,
+PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& pmeStream,
MPI_Comm comm,
gmx::ArrayRef<PpRanks> ppRanks) :
impl_(new Impl(pmeStream, comm, ppRanks))
* \param[in] comm Communicator used for simulation
* \param[in] ppRanks List of PP ranks
*/
- Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+ Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
~Impl();
/*! \brief
private:
//! CUDA stream for PME operations
- cudaStream_t pmeStream_ = nullptr;
+ const DeviceStream& pmeStream_;
//! communicator for simulation
MPI_Comm comm_;
//! list of PP ranks
#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/gmxmpi.h"
+class DeviceStream;
+
/*! \libinternal
* \brief Contains information about the PP ranks that partner this PME rank. */
struct PpRanks
* \param[in] comm Communicator used for simulation
* \param[in] ppRanks List of PP ranks
*/
- PmeForceSenderGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+ PmeForceSenderGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
~PmeForceSenderGpu();
/*! \brief
};
/*!\brief Constructor stub. */
-PmeForceSenderGpu::PmeForceSenderGpu(const void* /*pmeStream */,
+PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */,
MPI_Comm /* comm */,
gmx::ArrayRef<PpRanks> /* ppRanks */) :
impl_(nullptr)
{
/*! \brief Create PME-PP GPU communication object */
-PmeForceSenderGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks) :
- pmeStream_(*static_cast<const cudaStream_t*>(pmeStream)),
+PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks) :
+ pmeStream_(pmeStream),
comm_(comm),
ppRanks_(ppRanks)
{
#endif
}
-PmeForceSenderGpu::PmeForceSenderGpu(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks) :
+PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& pmeStream,
+ MPI_Comm comm,
+ gmx::ArrayRef<PpRanks> ppRanks) :
impl_(new Impl(pmeStream, comm, ppRanks))
{
}
* \param[in] comm Communicator used for simulation
* \param[in] ppRanks List of PP ranks
*/
- Impl(const void* pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+ Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
~Impl();
/*! \brief
private:
//! CUDA stream for PME operations
- cudaStream_t pmeStream_ = nullptr;
+ const DeviceStream& pmeStream_;
    //! Event triggered to allow the remote PP stream to sync with the PME stream
GpuEventSynchronizer pmeSync_;
//! communicator for simulation
pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x);
}
-void* pme_gpu_get_device_stream(const gmx_pme_t* pme)
+const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* pme)
{
if (!pme || !pme_gpu_active(pme))
{
realGridSizePaddedTotal, CUFFT_C2R, batch);
handleCufftError(result, "cufftPlanMany C2R plan failure");
- cudaStream_t stream = pmeGpu->archSpecific->pmeStream;
+ cudaStream_t stream = pmeGpu->archSpecific->pmeStream_.stream();
GMX_RELEASE_ASSERT(stream, "Using the default CUDA stream for PME cuFFT");
result = cufftSetStream(planR2C_, stream);
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#elif GMX_GPU == GMX_GPU_OPENCL
clfftPlanHandle planR2C_;
clfftPlanHandle planC2R_;
- std::vector<cl_command_queue> commandStreams_;
+ std::vector<cl_command_queue> deviceStreams_;
cl_mem realGrid_;
cl_mem complexGrid_;
#endif
"Complex padding not implemented");
}
cl_context context = pmeGpu->archSpecific->deviceContext_.context();
- commandStreams_.push_back(pmeGpu->archSpecific->pmeStream);
+ deviceStreams_.push_back(pmeGpu->archSpecific->pmeStream_.stream());
realGrid_ = kernelParamsPtr->grid.d_realGrid;
complexGrid_ = kernelParamsPtr->grid.d_fourierGrid;
const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT;
handleClfftError(clfftSetPlanOutStride(planC2R_, dims, realGridStrides.data()),
"clFFT stride setting failure");
- handleClfftError(clfftBakePlan(planR2C_, commandStreams_.size(), commandStreams_.data(), nullptr, nullptr),
+ handleClfftError(clfftBakePlan(planR2C_, deviceStreams_.size(), deviceStreams_.data(), nullptr, nullptr),
"clFFT precompiling failure");
- handleClfftError(clfftBakePlan(planC2R_, commandStreams_.size(), commandStreams_.data(), nullptr, nullptr),
+ handleClfftError(clfftBakePlan(planC2R_, deviceStreams_.size(), deviceStreams_.data(), nullptr, nullptr),
"clFFT precompiling failure");
// TODO: implement solve kernel as R2C FFT callback
GMX_THROW(
gmx::NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs"));
}
- handleClfftError(clfftEnqueueTransform(plan, direction, commandStreams_.size(),
- commandStreams_.data(), waitEvents.size(), waitEvents.data(),
+ handleClfftError(clfftEnqueueTransform(plan, direction, deviceStreams_.size(),
+ deviceStreams_.data(), waitEvents.size(), waitEvents.data(),
timingEvent, inputGrids, outputGrids, tempBuffer),
"clFFT execution failure");
}
void pme_gpu_synchronize(const PmeGpu* pmeGpu)
{
- gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream);
+ pmeGpu->archSpecific->pmeStream_.synchronize();
}
void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu)
void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu)
{
clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0,
- c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream);
+ c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_);
}
void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu)
}
/* TODO: pin original buffer instead! */
copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli,
- 0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream,
+ 0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind, nullptr);
}
GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, h_forcesFloat, 0,
- DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream,
+ DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind, nullptr);
}
GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0,
- DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream,
+ DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind, nullptr);
}
pmeGpu->archSpecific->deviceContext_);
copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
- pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
if (c_usePadding)
{
const size_t paddingIndex = pmeGpu->kernelParams->atoms.nAtoms;
if (paddingCount > 0)
{
clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
- paddingCount, pmeGpu->archSpecific->pmeStream);
+ paddingCount, pmeGpu->archSpecific->pmeStream_);
}
}
}
void pme_gpu_clear_grids(const PmeGpu* pmeGpu)
{
clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0,
- pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream);
+ pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_);
}
void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize,
pmeGpu->archSpecific->deviceContext_);
copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0,
- newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
+ newFractShiftsSize, pmeGpu->archSpecific->pmeStream_,
GpuApiCallBehavior::Async, nullptr);
copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(), 0,
- newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
+ newFractShiftsSize, pmeGpu->archSpecific->pmeStream_,
GpuApiCallBehavior::Async, nullptr);
#endif
}
bool pme_gpu_stream_query(const PmeGpu* pmeGpu)
{
- return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream);
+ return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream_);
}
void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid)
{
copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid, 0, pmeGpu->archSpecific->realGridSize,
- pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid)
{
copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid, 0,
- pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream,
+ pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind, nullptr);
- pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream);
+ pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream_);
}
void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu)
const size_t splinesCount = DIM * nAtomsPadded * pmeGpu->common->pme_order;
auto* kernelParamsPtr = pmeGpu->kernelParams.get();
copyFromDeviceBuffer(pmeGpu->staging.h_dtheta, &kernelParamsPtr->atoms.d_dtheta, 0, splinesCount,
- pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
copyFromDeviceBuffer(pmeGpu->staging.h_theta, &kernelParamsPtr->atoms.d_theta, 0, splinesCount,
- pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
copyFromDeviceBuffer(pmeGpu->staging.h_gridlineIndices, &kernelParamsPtr->atoms.d_gridlineIndices,
- 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream,
+ 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind, nullptr);
}
{
// TODO: could clear only the padding and not the whole thing, but this is a test-exclusive code anyway
clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_gridlineIndices, 0,
- pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream);
+ pmeGpu->nAtomsAlloc * DIM, pmeGpu->archSpecific->pmeStream_);
clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_dtheta, 0,
pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
- pmeGpu->archSpecific->pmeStream);
+ pmeGpu->archSpecific->pmeStream_);
clearDeviceBufferAsync(&kernelParamsPtr->atoms.d_theta, 0,
pmeGpu->nAtomsAlloc * pmeGpu->common->pme_order * DIM,
- pmeGpu->archSpecific->pmeStream);
+ pmeGpu->archSpecific->pmeStream_);
}
copyToDeviceBuffer(&kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, 0, splinesCount,
- pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
copyToDeviceBuffer(&kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, 0, splinesCount,
- pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
copyToDeviceBuffer(&kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
- 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream,
+ 0, kernelParamsPtr->atoms.nAtoms * DIM, pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind, nullptr);
}
int highest_priority, lowest_priority;
stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
- stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream,
+ cudaStream_t stream;
+ stat = cudaStreamCreateWithPriority(&stream,
cudaStreamDefault, // cudaStreamNonBlocking,
highest_priority);
    CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
+    pmeGpu->archSpecific->pmeStream_.setStream(stream);
#elif GMX_GPU == GMX_GPU_OPENCL
cl_command_queue_properties queueProperties =
pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId;
cl_int clError;
- pmeGpu->archSpecific->pmeStream = clCreateCommandQueue(
- pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError);
- if (clError != CL_SUCCESS)
- {
- GMX_THROW(gmx::InternalError("Failed to create PME command queue"));
- }
-#endif
-}
+ pmeGpu->archSpecific->pmeStream_.setStream(clCreateCommandQueue(
+ pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError));
+
-void pme_gpu_destroy_specific(const PmeGpu* pmeGpu)
-{
-#if GMX_GPU == GMX_GPU_CUDA
- /* Destroy the CUDA stream */
- cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream);
- CU_RET_ERR(stat, "PME cudaStreamDestroy error");
-#elif GMX_GPU == GMX_GPU_OPENCL
- cl_int clError = clReleaseCommandQueue(pmeGpu->archSpecific->pmeStream);
if (clError != CL_SUCCESS)
{
- gmx_warning("Failed to destroy PME command queue");
+        GMX_THROW(gmx::InternalError(gmx::formatString(
+                "Failed to create PME command queue (OpenCL error %d)", clError)));
}
#endif
}
pme_gpu_destroy_3dfft(pmeGpu);
- /* Free the GPU-framework specific data last */
- pme_gpu_destroy_specific(pmeGpu);
-
delete pmeGpu;
}
"Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
if (xReadyOnDevice)
{
- xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
+ xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
}
const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
config.blockSize[2] = atomsPerBlock;
config.gridSize[0] = dimGrid.first;
config.gridSize[1] = dimGrid.second;
- config.stream = pmeGpu->archSpecific->pmeStream;
+ config.stream = pmeGpu->archSpecific->pmeStream_.stream();
int timingId;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
if (copyInputAndOutputGrid)
{
copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, h_gridFloat, 0,
- pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream,
+ pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind, nullptr);
}
config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1)
/ gridLinesPerBlock;
config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim];
- config.stream = pmeGpu->archSpecific->pmeStream;
+ config.stream = pmeGpu->archSpecific->pmeStream_.stream();
int timingId = gtPME_SOLVE;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
{
copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy,
&kernelParamsPtr->constants.d_virialAndEnergy, 0, c_virialAndEnergyCount,
- pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
+ pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
}
if (copyInputAndOutputGrid)
{
copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid, 0,
- pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream,
+ pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_,
pmeGpu->settings.transferKind, nullptr);
}
}
config.blockSize[2] = atomsPerBlock;
config.gridSize[0] = dimGrid.first;
config.gridSize[1] = dimGrid.second;
- config.stream = pmeGpu->archSpecific->pmeStream;
+ config.stream = pmeGpu->archSpecific->pmeStream_.stream();
// TODO test different cache configs
if (pmeGpu->settings.useGpuForceReduction)
{
- pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream);
+ pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream_);
}
else
{
pmeGpu->kernelParams->atoms.d_coordinates = d_x;
}
-void* pme_gpu_get_stream(const PmeGpu* pmeGpu)
+const DeviceStream* pme_gpu_get_stream(const PmeGpu* pmeGpu)
{
if (pmeGpu)
{
- return static_cast<void*>(&pmeGpu->archSpecific->pmeStream);
+ return &pmeGpu->archSpecific->pmeStream_;
}
else
{
*/
void pme_gpu_init_internal(PmeGpu* pmeGpu);
-/*! \libinternal \brief
- * Destroys the PME GPU-framework specific data.
- * Should be called last in the PME GPU destructor.
- *
- * \param[in] pmeGpu The PME GPU structure.
- */
-void pme_gpu_destroy_specific(const PmeGpu* pmeGpu);
-
/*! \libinternal \brief
* Initializes the CUDA FFT structures.
*
* \param[in] pmeGpu The PME GPU structure.
* \returns Pointer to stream object.
*/
-GPU_FUNC_QUALIFIER void* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
+GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
GPU_FUNC_TERM_WITH_RETURN(nullptr);
/*! \brief Return pointer to the sync object triggered after the PME force calculation completion
{
GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
"Wrong PME GPU timing event index");
- pmeGpu->archSpecific->timingEvents[PMEStageId].openTimingRegion(pmeGpu->archSpecific->pmeStream);
+ pmeGpu->archSpecific->timingEvents[PMEStageId].openTimingRegion(pmeGpu->archSpecific->pmeStream_);
}
}
{
GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(),
"Wrong PME GPU timing event index");
- pmeGpu->archSpecific->timingEvents[PMEStageId].closeTimingRegion(pmeGpu->archSpecific->pmeStream);
+ pmeGpu->archSpecific->timingEvents[PMEStageId].closeTimingRegion(pmeGpu->archSpecific->pmeStream_);
}
}
* \param[in] deviceContext GPU device context.
*/
PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {}
- /*! \brief The GPU stream where everything related to the PME happens. */
- CommandStream pmeStream;
/*! \brief
* A handle to the GPU context.
*/
const DeviceContext& deviceContext_;
+ /*! \brief The GPU stream where everything related to the PME happens. */
+ DeviceStream pmeStream_;
+
/* Synchronization events */
/*! \brief Triggered after the PME Force Calculations have been completed */
GpuEventSynchronizer pmeForcesReady;
const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
if (useGpuForPme)
{
- const void* commandStream = pme_gpu_get_device_stream(pme);
+ const DeviceStream& deviceStream = *pme_gpu_get_device_stream(pme);
changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
if (c_enableGpuPmePpComms)
{
pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
- commandStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+ deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
- commandStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+ deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
}
GMX_RELEASE_ASSERT(
deviceContext != nullptr,
// TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
// This should be made safer.
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
- commandStream, *deviceContext, GpuApiCallBehavior::Async,
+ &deviceStream, *deviceContext, GpuApiCallBehavior::Async,
pme_gpu_get_padding_size(pme), wcycle);
}
GMX_RELEASE_ASSERT(
GMX_THREAD_MPI,
"PME-PP GPU Communication is currently only supported with thread-MPI enabled");
- cudaStreamCreate(&pmePpCommStream_);
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ pmePpCommStream_.setStream(stream);
}
PmePpCommGpu::Impl::~Impl() = default;
// Pull force data from remote GPU
void* pmeForcePtr = receivePmeForceToGpu ? static_cast<void*>(d_pmeForces_) : recvPtr;
cudaError_t stat = cudaMemcpyAsync(pmeForcePtr, remotePmeFBuffer_, recvSize * DIM * sizeof(float),
- cudaMemcpyDefault, pmePpCommStream_);
+ cudaMemcpyDefault, pmePpCommStream_.stream());
CU_RET_ERR(stat, "cudaMemcpyAsync on Recv from PME CUDA direct data transfer failed");
if (receivePmeForceToGpu)
{
// Ensure CPU waits for PME forces to be copied before reducing
// them with other forces on the CPU
- cudaStreamSynchronize(pmePpCommStream_);
+ cudaStreamSynchronize(pmePpCommStream_.stream());
}
#else
GMX_UNUSED_VALUE(recvPtr);
coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_);
cudaError_t stat = cudaMemcpyAsync(remotePmeXBuffer_, sendPtr, sendSize * DIM * sizeof(float),
- cudaMemcpyDefault, pmePpCommStream_);
+ cudaMemcpyDefault, pmePpCommStream_.stream());
CU_RET_ERR(stat, "cudaMemcpyAsync on Send to PME CUDA direct data transfer failed");
// Record and send event to allow PME task to sync to above transfer before commencing force calculations
//! Device context object
const DeviceContext& deviceContext_;
//! CUDA stream used for the communication operations in this class
- cudaStream_t pmePpCommStream_ = nullptr;
+ DeviceStream pmePpCommStream_;
//! Remote location of PME coordinate data buffer
void* remotePmeXBuffer_ = nullptr;
//! Remote location of PME force data buffer
if(GMX_USE_OPENCL)
gmx_add_libgromacs_sources(
device_context_ocl.cpp
+ device_stream_ocl.cpp
gpu_utils_ocl.cpp
ocl_compiler.cpp
ocl_caching.cpp
elseif(GMX_USE_CUDA)
gmx_add_libgromacs_sources(
cudautils.cu
+ device_stream.cu
gpu_utils.cu
pinning.cu
pmalloc_cuda.cu
)
+else()
+ gmx_add_libgromacs_sources(
+ device_stream.cpp
+ )
endif()
if (BUILD_TESTING)
rvec_inc(a, tmp);
}
-/*! \brief Wait for all taks in stream \p s to complete.
- *
- * \param[in] s stream to synchronize with
- */
-static inline void gpuStreamSynchronize(cudaStream_t s)
-{
- cudaError_t stat = cudaStreamSynchronize(s);
- CU_RET_ERR(stat, "cudaStreamSynchronize failed");
-}
-
/*! \brief Returns true if all tasks in \p s have completed.
*
- * \param[in] s stream to check
+ * \param[in] deviceStream CUDA stream to check.
*
- * \returns True if all tasks enqueued in the stream \p s (at the time of this call) have completed.
+ * \returns True if all tasks enqueued in the stream \p deviceStream (at the time of this call) have completed.
*/
-static inline bool haveStreamTasksCompleted(cudaStream_t s)
+static inline bool haveStreamTasksCompleted(const DeviceStream& deviceStream)
{
- cudaError_t stat = cudaStreamQuery(s);
+ cudaError_t stat = cudaStreamQuery(deviceStream.stream());
if (stat == cudaErrorNotReady)
{
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream stub for CPU-only builds.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "device_stream.h"
+
+DeviceStream::DeviceStream() = default;
+
+void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
+ const DeviceContext& /* deviceContext */,
+ DeviceStreamPriority /* priority */,
+ const bool /* useTiming */)
+{
+}
+
+DeviceStream::~DeviceStream() = default;
+
+void DeviceStream::synchronize() const {}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream for CUDA.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "device_stream.h"
+
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+DeviceStream::DeviceStream()
+{
+ stream_ = nullptr;
+}
+
+void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
+ const DeviceContext& /* deviceContext */,
+ DeviceStreamPriority priority,
+ const bool /* useTiming */)
+{
+ cudaError_t stat;
+
+ if (priority == DeviceStreamPriority::Normal)
+ {
+ stat = cudaStreamCreate(&stream_);
+ if (stat != cudaSuccess)
+ {
+ GMX_THROW(gmx::InternalError(gmx::formatString(
+ "Could not create CUDA stream (CUDA error %d: %s).", stat, cudaGetErrorString(stat))));
+ }
+ }
+ else if (priority == DeviceStreamPriority::High)
+ {
+ // Note that the device we're running on does not have to
+ // support priorities, because we are querying the priority
+ // range, which in that case will be a single value.
+ int highestPriority;
+ stat = cudaDeviceGetStreamPriorityRange(nullptr, &highestPriority);
+ if (stat != cudaSuccess)
+ {
+ GMX_THROW(gmx::InternalError(gmx::formatString(
+ "Could not query CUDA stream priority range (CUDA error %d: %s).", stat,
+ cudaGetErrorString(stat))));
+ }
+
+ stat = cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, highestPriority);
+ if (stat != cudaSuccess)
+ {
+ GMX_THROW(gmx::InternalError(gmx::formatString(
+ "Could not create CUDA stream with high priority (CUDA error %d: %s).", stat,
+ cudaGetErrorString(stat))));
+ }
+ }
+}
+
+DeviceStream::~DeviceStream()
+{
+ if (stream_)
+ {
+ cudaError_t stat = cudaStreamDestroy(stream_);
+ GMX_RELEASE_ASSERT(stat == cudaSuccess,
+ gmx::formatString("Failed to release CUDA stream (CUDA error %d: %s).",
+ stat, cudaGetErrorString(stat))
+ .c_str());
+ stream_ = nullptr;
+ }
+}
+
+cudaStream_t DeviceStream::stream() const
+{
+ return stream_;
+}
+
+void DeviceStream::synchronize() const
+{
+ cudaError_t stat = cudaStreamSynchronize(stream_);
+ GMX_RELEASE_ASSERT(stat == cudaSuccess,
+ gmx::formatString("cudaStreamSynchronize failed (CUDA error %d: %s).", stat,
+ cudaGetErrorString(stat))
+ .c_str());
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_STREAM_H
+#define GMX_GPU_UTILS_DEVICE_STREAM_H
+
+/*! \libinternal \file
+ *
+ * \brief Declarations for DeviceStream class.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ * \inlibraryapi
+ */
+
+#include "config.h"
+
+#if GMX_GPU == GMX_GPU_CUDA
+#    include <cuda_runtime.h>
+#elif GMX_GPU == GMX_GPU_OPENCL
+#    include "gromacs/gpu_utils/gmxopencl.h"
+#endif
+#include "gromacs/utility/classhelpers.h"
+
+struct DeviceInformation;
+class DeviceContext;
+
+//! Enumeration describing the priority with which a stream operates.
+enum class DeviceStreamPriority : int
+{
+ //! High-priority stream
+ High,
+ //! Normal-priority stream
+ Normal,
+ //! Conventional termination of the enumeration
+ Count
+};
+
+/*! \libinternal \brief
+ * Wrapper around the platform-dependent GPU stream: cudaStream_t in CUDA,
+ * cl_command_queue in OpenCL, and a stub for CPU-only builds.
+ */
+class DeviceStream
+{
+public:
+ //! Default constructor
+ DeviceStream();
+ //! Destructor
+ ~DeviceStream();
+
+ /*! \brief Initialize
+ *
+     * \param[in] deviceInfo    Platform-specific device information (only used in OpenCL).
+     * \param[in] deviceContext Device context (only used in OpenCL).
+     * \param[in] priority      Stream priority: high or normal (only used in CUDA).
+     * \param[in] useTiming     If the timing should be enabled (only used in OpenCL).
+ */
+ void init(const DeviceInformation& deviceInfo,
+ const DeviceContext& deviceContext,
+ DeviceStreamPriority priority,
+ const bool useTiming);
+
+ /*! \brief Construct and init.
+ *
+ * \param[in] deviceInfo Platform-specific device information (only used in OpenCL).
+ * \param[in] deviceContext Device context (only used in OpenCL).
+ * \param[in] priority Stream priority: high or normal (only used in CUDA).
+ * \param[in] useTiming If the timing should be enabled (only used in OpenCL).
+ */
+ DeviceStream(const DeviceInformation& deviceInfo,
+ const DeviceContext& deviceContext,
+ DeviceStreamPriority priority,
+ const bool useTiming)
+ {
+ init(deviceInfo, deviceContext, priority, useTiming);
+ }
+
+    //! Synchronize the stream
+ void synchronize() const;
+
+#if GMX_GPU == GMX_GPU_CUDA
+
+ //! Getter
+ cudaStream_t stream() const;
+ //! Setter (temporary, will be removed in the follow-up)
+ void setStream(cudaStream_t stream) { stream_ = stream; }
+
+private:
+ cudaStream_t stream_ = nullptr;
+
+#elif GMX_GPU == GMX_GPU_OPENCL
+
+ //! Getter
+ cl_command_queue stream() const;
+ //! Setter (temporary, will be removed in the follow-up)
+ void setStream(cl_command_queue stream) { stream_ = stream; }
+
+private:
+ cl_command_queue stream_ = nullptr;
+
+#endif
+
+ GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceStream);
+};
+
+#endif // GMX_GPU_UTILS_DEVICE_STREAM_H
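
To make the intended usage of the new wrapper concrete, here is a minimal sketch of a call site (hypothetical function and variable names, not part of this change) that creates a high-priority stream and synchronizes on it:

    // Sketch only: deviceInfo/deviceContext are assumed to come from the usual
    // GPU detection and context setup code.
    void exampleStreamUsage(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext)
    {
        // Construct and init in one step; priority is honored in CUDA,
        // useTiming in OpenCL (see the parameter docs above).
        DeviceStream stream(deviceInfo, deviceContext, DeviceStreamPriority::High,
                            /* useTiming = */ false);

        // ... enqueue copies and kernels on `stream` via the DeviceStream-aware helpers ...

        // Block the host until everything enqueued so far has completed.
        stream.synchronize();
    }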
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceStream for OpenCL.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/device_context_ocl.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+DeviceStream::DeviceStream()
+{
+ stream_ = nullptr;
+}
+
+void DeviceStream::init(const DeviceInformation& deviceInfo,
+ const DeviceContext& deviceContext,
+ DeviceStreamPriority /* priority */,
+ const bool useTiming)
+{
+ cl_command_queue_properties queueProperties = useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
+ cl_device_id deviceId = deviceInfo.oclDeviceId;
+ cl_int clError;
+ stream_ = clCreateCommandQueue(deviceContext.context(), deviceId, queueProperties, &clError);
+ if (clError != CL_SUCCESS)
+ {
+ GMX_THROW(gmx::InternalError(gmx::formatString(
+ "Failed to create OpenCL command queue on GPU %s (OpenCL error ID %d).",
+ deviceInfo.device_name, clError)));
+ }
+}
+
+DeviceStream::~DeviceStream()
+{
+ if (stream_)
+ {
+ cl_int clError = clReleaseCommandQueue(stream_);
+ GMX_RELEASE_ASSERT(
+ clError == CL_SUCCESS,
+ gmx::formatString("Failed to release OpenCL stream (OpenCL error ID %d).", clError).c_str());
+ stream_ = nullptr;
+ }
+}
+
+cl_command_queue DeviceStream::stream() const
+{
+ return stream_;
+}
+
+void DeviceStream::synchronize() const
+{
+ cl_int clError = clFinish(stream_);
+ GMX_RELEASE_ASSERT(
+ CL_SUCCESS == clError,
+ gmx::formatString("Error caught during clFinish (OpenCL error ID %d).", clError).c_str());
+}
* \param[in] hostBuffer Pointer to the raw host-side memory, also typed \p ValueType
* \param[in] startingOffset Offset (in values) at the device-side buffer to copy into.
* \param[in] numValues Number of values to copy.
- * \param[in] stream GPU stream to perform asynchronous copy in.
+ * \param[in] deviceStream GPU stream to perform asynchronous copy in.
* \param[in] transferKind Copy type: synchronous or asynchronous.
* \param[out] timingEvent A dummy pointer to the H2D copy timing event to be filled in.
* Not used in CUDA implementation.
const ValueType* hostBuffer,
size_t startingOffset,
size_t numValues,
- CommandStream stream,
+ const DeviceStream& deviceStream,
GpuApiCallBehavior transferKind,
CommandEvent* /*timingEvent*/)
{
GMX_ASSERT(isHostMemoryPinned(hostBuffer),
"Source host buffer was not pinned for CUDA");
stat = cudaMemcpyAsync(*((ValueType**)buffer) + startingOffset, hostBuffer, bytes,
- cudaMemcpyHostToDevice, stream);
+ cudaMemcpyHostToDevice, deviceStream.stream());
GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous H2D copy failed");
break;
* \param[in] buffer Pointer to the device-side buffer
* \param[in] startingOffset Offset (in values) at the device-side buffer to copy from.
* \param[in] numValues Number of values to copy.
- * \param[in] stream GPU stream to perform asynchronous copy in.
+ * \param[in] deviceStream GPU stream to perform asynchronous copy in.
* \param[in] transferKind Copy type: synchronous or asynchronous.
* \param[out] timingEvent A dummy pointer to the H2D copy timing event to be filled in.
* Not used in CUDA implementation.
DeviceBuffer<ValueType>* buffer,
size_t startingOffset,
size_t numValues,
- CommandStream stream,
+ const DeviceStream& deviceStream,
GpuApiCallBehavior transferKind,
CommandEvent* /*timingEvent*/)
{
GMX_ASSERT(isHostMemoryPinned(hostBuffer),
"Destination host buffer was not pinned for CUDA");
stat = cudaMemcpyAsync(hostBuffer, *((ValueType**)buffer) + startingOffset, bytes,
- cudaMemcpyDeviceToHost, stream);
+ cudaMemcpyDeviceToHost, deviceStream.stream());
GMX_RELEASE_ASSERT(stat == cudaSuccess, "Asynchronous D2H copy failed");
break;
* \param[in,out] buffer Pointer to the device-side buffer
* \param[in] startingOffset Offset (in values) at the device-side buffer to start clearing at.
* \param[in] numValues Number of values to clear.
- * \param[in] stream GPU stream.
+ * \param[in] deviceStream GPU stream.
*/
template<typename ValueType>
-void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer, size_t startingOffset, size_t numValues, CommandStream stream)
+void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer,
+ size_t startingOffset,
+ size_t numValues,
+ const DeviceStream& deviceStream)
{
GMX_ASSERT(buffer, "needs a buffer pointer");
const size_t bytes = numValues * sizeof(ValueType);
const char pattern = 0;
- cudaError_t stat = cudaMemsetAsync(*((ValueType**)buffer) + startingOffset, pattern, bytes, stream);
+ cudaError_t stat = cudaMemsetAsync(*((ValueType**)buffer) + startingOffset, pattern, bytes,
+ deviceStream.stream());
GMX_RELEASE_ASSERT(stat == cudaSuccess, "Couldn't clear the device buffer");
}
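
With the signatures above, a caller now passes the DeviceStream wrapper directly instead of a raw CommandStream. A minimal sketch (d_forces, h_forces, numValues and deviceStream are hypothetical names assumed to exist at the call site):

    // Async H2D copy followed by an async clear, both ordered in deviceStream.
    copyToDeviceBuffer(&d_forces, h_forces, 0, numValues, deviceStream,
                       GpuApiCallBehavior::Async, nullptr);
    clearDeviceBufferAsync(&d_forces, 0, numValues, deviceStream);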
* \param[in] hostBuffer Pointer to the raw host-side memory, also typed \p ValueType
* \param[in] startingOffset Offset (in values) at the device-side buffer to copy into.
* \param[in] numValues Number of values to copy.
- * \param[in] stream GPU stream to perform asynchronous copy in.
+ * \param[in] deviceStream GPU stream to perform asynchronous copy in.
* \param[in] transferKind Copy type: synchronous or asynchronous.
* \param[out] timingEvent A pointer to the H2D copy timing event to be filled in.
* If the pointer is not null, the event can further be used
const ValueType* hostBuffer,
size_t startingOffset,
size_t numValues,
- CommandStream stream,
+ const DeviceStream& deviceStream,
GpuApiCallBehavior transferKind,
CommandEvent* timingEvent)
{
switch (transferKind)
{
case GpuApiCallBehavior::Async:
- clError = clEnqueueWriteBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0,
- nullptr, timingEvent);
+ clError = clEnqueueWriteBuffer(deviceStream.stream(), *buffer, CL_FALSE, offset, bytes,
+ hostBuffer, 0, nullptr, timingEvent);
GMX_RELEASE_ASSERT(
clError == CL_SUCCESS,
gmx::formatString("Asynchronous H2D copy failed (OpenCL error %d: %s)", clError,
break;
case GpuApiCallBehavior::Sync:
- clError = clEnqueueWriteBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0,
- nullptr, timingEvent);
+ clError = clEnqueueWriteBuffer(deviceStream.stream(), *buffer, CL_TRUE, offset, bytes,
+ hostBuffer, 0, nullptr, timingEvent);
GMX_RELEASE_ASSERT(
clError == CL_SUCCESS,
gmx::formatString("Synchronous H2D copy failed (OpenCL error %d: %s)", clError,
* \param[in] buffer Pointer to the device-side buffer
* \param[in] startingOffset Offset (in values) at the device-side buffer to copy from.
* \param[in] numValues Number of values to copy.
- * \param[in] stream GPU stream to perform asynchronous copy in.
+ * \param[in] deviceStream GPU stream to perform asynchronous copy in.
* \param[in] transferKind Copy type: synchronous or asynchronous.
* \param[out] timingEvent A pointer to the H2D copy timing event to be filled in.
* If the pointer is not null, the event can further be used
DeviceBuffer<ValueType>* buffer,
size_t startingOffset,
size_t numValues,
- CommandStream stream,
+ const DeviceStream& deviceStream,
GpuApiCallBehavior transferKind,
CommandEvent* timingEvent)
{
switch (transferKind)
{
case GpuApiCallBehavior::Async:
- clError = clEnqueueReadBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0,
- nullptr, timingEvent);
+ clError = clEnqueueReadBuffer(deviceStream.stream(), *buffer, CL_FALSE, offset, bytes,
+ hostBuffer, 0, nullptr, timingEvent);
GMX_RELEASE_ASSERT(
clError == CL_SUCCESS,
gmx::formatString("Asynchronous D2H copy failed (OpenCL error %d: %s)", clError,
break;
case GpuApiCallBehavior::Sync:
- clError = clEnqueueReadBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0,
- nullptr, timingEvent);
+ clError = clEnqueueReadBuffer(deviceStream.stream(), *buffer, CL_TRUE, offset, bytes,
+ hostBuffer, 0, nullptr, timingEvent);
GMX_RELEASE_ASSERT(
clError == CL_SUCCESS,
gmx::formatString("Synchronous D2H copy failed (OpenCL error %d: %s)", clError,
* \param[in,out] buffer Pointer to the device-side buffer
* \param[in] startingOffset Offset (in values) at the device-side buffer to start clearing at.
* \param[in] numValues Number of values to clear.
- * \param[in] stream GPU stream.
+ * \param[in] deviceStream GPU stream.
*/
template<typename ValueType>
-void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer, size_t startingOffset, size_t numValues, CommandStream stream)
+void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer,
+ size_t startingOffset,
+ size_t numValues,
+ const DeviceStream& deviceStream)
{
GMX_ASSERT(buffer, "needs a buffer pointer");
const size_t offset = startingOffset * sizeof(ValueType);
const cl_uint numWaitEvents = 0;
const cl_event* waitEvents = nullptr;
cl_event commandEvent;
- cl_int clError = clEnqueueFillBuffer(stream, *buffer, &pattern, sizeof(pattern), offset, bytes,
- numWaitEvents, waitEvents, &commandEvent);
+ cl_int clError = clEnqueueFillBuffer(deviceStream.stream(), *buffer, &pattern, sizeof(pattern),
+ offset, bytes, numWaitEvents, waitEvents, &commandEvent);
GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
gmx::formatString("Couldn't clear the device buffer (OpenCL error %d: %s)",
clError, ocl_get_error_string(clError).c_str())
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
-    /*! \brief Marks the synchronization point in the \p stream.
+    /*! \brief Marks the synchronization point in the \p deviceStream.
* Should be followed by waitForEvent().
*/
- inline void markEvent(CommandStream stream)
+ inline void markEvent(const DeviceStream& deviceStream)
{
- cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, stream);
+ cudaError_t gmx_used_in_debug stat = cudaEventRecord(event_, deviceStream.stream());
GMX_ASSERT(stat == cudaSuccess, "cudaEventRecord failed");
}
/*! \brief Synchronizes the host thread on the marked event. */
GMX_ASSERT(stat == cudaSuccess, "cudaEventSynchronize failed");
}
-    /*! \brief Enqueues a wait for the recorded event in stream \p stream */
+    /*! \brief Enqueues a wait for the recorded event in stream \p deviceStream */
- inline void enqueueWaitEvent(CommandStream stream)
+ inline void enqueueWaitEvent(const DeviceStream& deviceStream)
{
- cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(stream, event_, 0);
+ cudaError_t gmx_used_in_debug stat = cudaStreamWaitEvent(deviceStream.stream(), event_, 0);
GMX_ASSERT(stat == cudaSuccess, "cudaStreamWaitEvent failed");
}
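
These two methods implement the usual producer/consumer inter-stream ordering; a minimal sketch with hypothetical stream names (the same pattern the halo-exchange code above uses):

    GpuEventSynchronizer sync;
    sync.markEvent(producerStream);         // record after the producer's enqueued work
    sync.enqueueWaitEvent(consumerStream);  // consumer stream waits on the device,
                                            // without blocking the host thread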
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
-    /*! \brief Marks the synchronization point in the \p stream.
+    /*! \brief Marks the synchronization point in the \p deviceStream.
* Should be called first and then followed by waitForEvent().
*/
- inline void markEvent(CommandStream stream)
+ inline void markEvent(const DeviceStream& deviceStream)
{
GMX_ASSERT(nullptr == event_, "Do not call markEvent more than once!");
- cl_int clError = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &event_);
+ cl_int clError = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &event_);
if (CL_SUCCESS != clError)
{
GMX_THROW(gmx::InternalError("Failed to enqueue the GPU synchronization event: "
* After enqueue, the associated event is released, so this method should
* be only called once per markEvent() call.
*/
- inline void enqueueWaitEvent(CommandStream stream)
+ inline void enqueueWaitEvent(const DeviceStream& deviceStream)
{
- cl_int clError = clEnqueueBarrierWithWaitList(stream, 1, &event_, nullptr);
+ cl_int clError = clEnqueueBarrierWithWaitList(deviceStream.stream(), 1, &event_, nullptr);
if (CL_SUCCESS != clError)
{
GMX_THROW(gmx::InternalError("Failed to enqueue device barrier for the GPU event: "
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
GpuRegionTimerImpl(GpuRegionTimerImpl&&) = delete;
/*! \brief Will be called before the region start. */
- inline void openTimingRegion(CommandStream s)
+ inline void openTimingRegion(const DeviceStream& deviceStream)
{
- CU_RET_ERR(cudaEventRecord(eventStart_, s), "GPU timing recording failure");
+ CU_RET_ERR(cudaEventRecord(eventStart_, deviceStream.stream()),
+ "GPU timing recording failure");
}
/*! \brief Will be called after the region end. */
- inline void closeTimingRegion(CommandStream s)
+ inline void closeTimingRegion(const DeviceStream& deviceStream)
{
- CU_RET_ERR(cudaEventRecord(eventStop_, s), "GPU timing recording failure");
+ CU_RET_ERR(cudaEventRecord(eventStop_, deviceStream.stream()),
+ "GPU timing recording failure");
}
/*! \brief Returns the last measured region timespan (in milliseconds) and calls reset() */
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/*! \brief
* To be called before the region start.
*
- * \param[in] s The GPU command stream where the event being measured takes place.
+ * \param[in] deviceStream The GPU command stream where the event being measured takes place.
*/
- void openTimingRegion(CommandStream s)
+ void openTimingRegion(const DeviceStream& deviceStream)
{
if (c_debugTimerState)
{
GMX_ASSERT(debugState_ == TimerState::Idle, error.c_str());
debugState_ = TimerState::Recording;
}
- impl_.openTimingRegion(s);
+ impl_.openTimingRegion(deviceStream);
}
/*! \brief
* To be called after the region end.
*
- * \param[in] s The GPU command stream where the event being measured takes place.
+ * \param[in] deviceStream The GPU command stream where the event being measured takes place.
*/
- void closeTimingRegion(CommandStream s)
+ void closeTimingRegion(const DeviceStream& deviceStream)
{
if (c_debugTimerState)
{
debugState_ = TimerState::Stopped;
}
callCount_++;
- impl_.closeTimingRegion(s);
+ impl_.closeTimingRegion(deviceStream);
}
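A condensed sketch of how the wrapper is driven around a timed region after the change; launchSomeGpuWork is a hypothetical stand-in for whatever work is enqueued between the two calls:

timer.openTimingRegion(deviceStream);  // record the start event in the stream
launchSomeGpuWork(deviceStream);       // hypothetical enqueue of the work to be timed
timer.closeTimingRegion(deviceStream); // record the stop event in the same stream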
/*! \brief
* Accumulates the last timespan of all the events used into the total duration,
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
GpuRegionTimerImpl(GpuRegionTimerImpl&&) = delete;
/*! \brief Should be called before the region start. */
- inline void openTimingRegion(CommandStream /*unused*/) {}
+ inline void openTimingRegion(const DeviceStream& /*unused*/) {}
/*! \brief Should be called after the region end. */
- inline void closeTimingRegion(CommandStream /*unused*/) {}
+ inline void closeTimingRegion(const DeviceStream& /*unused*/) {}
/*! \brief Returns the last measured region timespan (in milliseconds) and calls reset(). */
inline double getLastRangeTime()
{
* \ingroup module_gpu_utils
*/
+#include "gromacs/gpu_utils/device_stream.h"
+
/*! \brief CUDA device information.
*
* The CUDA device information is queried and set at detection and contains
int stat;
};
-//! \brief GPU command stream
-using CommandStream = cudaStream_t;
//! \brief Single GPU call timing event - meaningless in CUDA
using CommandEvent = void;
*/
struct KernelLaunchConfig
{
- size_t gridSize[3] = { 1, 1, 1 }; //!< Block counts
- size_t blockSize[3] = { 1, 1, 1 }; //!< Per-block thread counts
- size_t sharedMemorySize = 0; //!< Shared memory size in bytes
- CommandStream stream = nullptr; //!< Stream to launch kernel in
+ size_t gridSize[3] = { 1, 1, 1 }; //!< Block counts
+ size_t blockSize[3] = { 1, 1, 1 }; //!< Per-block thread counts
+ size_t sharedMemorySize = 0; //!< Shared memory size in bytes
+ cudaStream_t stream = nullptr; //!< Stream to launch kernel in
};
//! Sets whether device code can use arrays that are embedded in structs.
// No member needed
};
-//! \brief GPU command stream
-using CommandStream = void*;
//! \brief Single GPU call timing event
using CommandEvent = void*;
size_t maxWorkGroupSize; //!< Workgroup total size limit (CL_DEVICE_MAX_WORK_GROUP_SIZE).
};
-//! \brief GPU command stream
-using CommandStream = cl_command_queue;
//! \brief Single GPU call timing event
using CommandEvent = cl_event;
*/
struct KernelLaunchConfig
{
- size_t gridSize[3] = { 1, 1, 1 }; //!< Work groups (CUDA blocks) counts
- size_t blockSize[3] = { 1, 1, 1 }; //!< Per work group (CUDA block) thread counts
- size_t sharedMemorySize = 0; //!< Shared memory size in bytes
- CommandStream stream = nullptr; //!< Stream to launch kernel in
+ size_t gridSize[3] = { 1, 1, 1 }; //!< Work groups (CUDA blocks) counts
+ size_t blockSize[3] = { 1, 1, 1 }; //!< Per work group (CUDA block) thread counts
+ size_t sharedMemorySize = 0; //!< Shared memory size in bytes
+ cl_command_queue stream = nullptr; //!< Stream to launch kernel in
};
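With the CommandStream alias removed, each backend's KernelLaunchConfig carries the raw handle directly and call sites obtain it from the DeviceStream wrapper. A sketch of the resulting call-site pattern, with illustrative block and grid sizes:

KernelLaunchConfig config;
config.blockSize[0]     = 128;       // illustrative
config.gridSize[0]      = numBlocks; // illustrative
config.sharedMemorySize = 0;
config.stream           = deviceStream.stream(); // cudaStream_t or cl_command_queue, per backend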
/*! \brief Sets whether device code can use arrays that are embedded in structs.
#include <string>
#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/gmxopencl.h"
#include "gromacs/gpu_utils/gputraits_ocl.h"
#include "gromacs/utility/exceptions.h"
/*! \brief Convert error code to diagnostic string */
std::string ocl_get_error_string(cl_int error);
-/*! \brief Calls clFinish() in the stream \p s.
- *
- * \param[in] s stream to synchronize with
- */
-static inline void gpuStreamSynchronize(cl_command_queue s)
-{
- cl_int cl_error = clFinish(s);
- GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error,
- ("Error caught during clFinish:" + ocl_get_error_string(cl_error)).c_str());
-}
-
//! A debug checker to track cl_events being released correctly
inline void ensureReferenceCount(const cl_event& event, unsigned int refCount)
{
/*! \brief Pretend to synchronize an OpenCL stream (dummy implementation).
*
- * \param[in] s queue to check
- *
- * \returns True if all tasks enqueued in the stream \p s (at the time of this call) have completed.
+ * \returns Always false (not implemented in OpenCL).
*/
-static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s)
+static inline bool haveStreamTasksCompleted(const DeviceStream& /* deviceStream */)
{
GMX_RELEASE_ASSERT(false, "haveStreamTasksCompleted is not implemented for OpenCL");
return false;
{
DeviceInformation deviceInfo;
const DeviceContext deviceContext(deviceInfo);
+ const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
const int numElements = h_rVecInput.size();
DeviceBuffer<RVec> d_rVecInput;
allocateDeviceBuffer(&d_rVecInput, numElements, deviceContext);
- copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, nullptr,
+ copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, deviceStream,
GpuApiCallBehavior::Sync, nullptr);
DeviceBuffer<float3> d_float3Output;
kernelLaunchConfig.blockSize[1] = 1;
kernelLaunchConfig.blockSize[2] = 1;
kernelLaunchConfig.sharedMemorySize = 0;
- kernelLaunchConfig.stream = nullptr;
+ kernelLaunchConfig.stream = deviceStream.stream();
auto kernelPtr = convertRVecToFloat3OnDevice_kernel;
const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig,
&d_float3Output, &d_rVecInput, &numElements);
launchGpuKernel(kernelPtr, kernelLaunchConfig, nullptr, "convertRVecToFloat3OnDevice_kernel", kernelArgs);
- copyFromDeviceBuffer(h_float3Output.data(), &d_float3Output, 0, numElements, nullptr,
+ copyFromDeviceBuffer(h_float3Output.data(), &d_float3Output, 0, numElements, deviceStream,
GpuApiCallBehavior::Sync, nullptr);
saveFloat3InRVecFormat(h_rVecOutput, h_float3Output.data(), numElements);
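The tests now construct their stream explicitly. Judging from the call sites in this change, the constructor shape being assumed is the following; this is inferred from the hunks only:

// Inferred signature (not verified beyond the call sites in this change):
// DeviceStream(const DeviceInformation&, const DeviceContext&, DeviceStreamPriority, bool useTiming);
const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);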
#include "gromacs/utility/classhelpers.h"
class DeviceContext;
+class DeviceStream;
+
struct gmx_enerdata_t;
struct gmx_ffparams_t;
struct gmx_mtop_t;
//! Construct the manager with constant data and the stream to use.
GpuBonded(const gmx_ffparams_t& ffparams,
const DeviceContext& deviceContext,
- void* streamPtr,
+ const DeviceStream& deviceStream,
gmx_wallcycle* wcycle);
//! Destructor
~GpuBonded();
GpuBonded::GpuBonded(const gmx_ffparams_t& /* ffparams */,
const DeviceContext& /* deviceContext */,
- void* /*streamPtr */,
+ const DeviceStream& /* deviceStream */,
gmx_wallcycle* /* wcycle */) :
impl_(nullptr)
{
GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
const DeviceContext& deviceContext,
- void* streamPtr,
+ const DeviceStream& deviceStream,
gmx_wallcycle* wcycle) :
- deviceContext_(deviceContext)
+ deviceContext_(deviceContext),
+ deviceStream_(deviceStream)
{
- stream_ = *static_cast<CommandStream*>(streamPtr);
wcycle_ = wcycle;
allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_);
// This could be an async transfer (if the source is pinned), so
// long as it uses the same stream as the kernels and we are happy
// to consume additional pinned pages.
- copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(), stream_,
- GpuApiCallBehavior::Sync, nullptr);
+ copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(),
+ deviceStream_, GpuApiCallBehavior::Sync, nullptr);
vTot_.resize(F_NRE);
allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_);
- clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
+ clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream);
kernelParams_.d_forceParams = d_forceParams_;
kernelParams_.d_xq = d_xq_;
reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc,
deviceContext_);
- copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), stream_,
+ copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), deviceStream_,
GpuApiCallBehavior::Async, nullptr);
}
kernelParams_.fTypesOnGpu[fTypesCounter] = fType;
wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED);
// TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage)
float* h_vTot = vTot_.data();
- copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, stream_, GpuApiCallBehavior::Async, nullptr);
+ copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, deviceStream_, GpuApiCallBehavior::Async, nullptr);
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
}
"accumulation should not occur");
wallcycle_start(wcycle_, ewcWAIT_GPU_BONDED);
- cudaError_t stat = cudaStreamSynchronize(stream_);
+ cudaError_t stat = cudaStreamSynchronize(deviceStream_.stream());
CU_RET_ERR(stat, "D2H transfer of bonded energies failed");
wallcycle_stop(wcycle_, ewcWAIT_GPU_BONDED);
{
wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED);
- clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
+ clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
GpuBonded::GpuBonded(const gmx_ffparams_t& ffparams,
const DeviceContext& deviceContext,
- void* streamPtr,
+ const DeviceStream& deviceStream,
gmx_wallcycle* wcycle) :
- impl_(new Impl(ffparams, deviceContext, streamPtr, wcycle))
+ impl_(new Impl(ffparams, deviceContext, deviceStream, wcycle))
{
}
{
public:
//! Constructor
- Impl(const gmx_ffparams_t& ffparams, const DeviceContext& deviceContext, void* streamPtr, gmx_wallcycle* wcycle);
+ Impl(const gmx_ffparams_t& ffparams,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream,
+ gmx_wallcycle* wcycle);
/*! \brief Destructor, non-default needed for freeing
* device-side buffers */
~Impl();
//! GPU context object
const DeviceContext& deviceContext_;
//! \brief Bonded GPU stream, not owned by this module
- CommandStream stream_;
+ const DeviceStream& deviceStream_;
//! Parameters and pointers, passed to the CUDA kernel
BondedCudaKernelParameters kernelParams_;
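Since the stream is now held as a const reference rather than a copied handle, it must be bound in the member-initializer list and the referenced DeviceStream must outlive the module. A minimal sketch of the pattern shared by GpuBonded, LeapFrogGpu, LincsGpu, and SettleGpu in this change (GpuModuleSketch is a stand-in name):

class GpuModuleSketch
{
public:
    GpuModuleSketch(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
        deviceContext_(deviceContext),
        deviceStream_(deviceStream) // references cannot be reseated, so bind them here
    {
    }

private:
    const DeviceContext& deviceContext_; // not owned
    const DeviceStream&  deviceStream_;  // not owned; the caller guarantees its lifetime
};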
config.gridSize[0] = (fTypeRangeEnd + TPB_BONDED) / TPB_BONDED;
config.gridSize[1] = 1;
config.gridSize[2] = 1;
- config.stream = stream_;
+ config.stream = deviceStream_.stream();
auto kernelPtr = exec_kernel_gpu<calcVir, calcEner>;
kernelParams_.scaleFactor = fr->ic->epsfac * fr->fudgeQQ;
h_lambdas_[i] = tcstat[i].lambda;
}
copyToDeviceBuffer(&d_lambdas_, h_lambdas_.data(), 0, numTempScaleValues_,
- commandStream_, GpuApiCallBehavior::Async, nullptr);
+ deviceStream_, GpuApiCallBehavior::Async, nullptr);
}
VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
if (doParrinelloRahman)
return;
}
-LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream) :
+LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
deviceContext_(deviceContext),
- commandStream_(commandStream)
+ deviceStream_(deviceStream)
{
numAtoms_ = 0;
kernelLaunchConfig_.blockSize[1] = 1;
kernelLaunchConfig_.blockSize[2] = 1;
kernelLaunchConfig_.sharedMemorySize = 0;
- kernelLaunchConfig_.stream = commandStream_;
+ kernelLaunchConfig_.stream = deviceStream_.stream();
}
LeapFrogGpu::~LeapFrogGpu()
reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_,
&numInverseMassesAlloc_, deviceContext_);
- copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, commandStream_,
+ copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, deviceStream_,
GpuApiCallBehavior::Sync, nullptr);
// Temperature scale group map is only used if there is more than one group
{
reallocateDeviceBuffer(&d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_,
&numTempScaleGroupsAlloc_, deviceContext_);
- copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, commandStream_,
+ copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_,
GpuApiCallBehavior::Sync, nullptr);
}
/*! \brief Constructor.
*
* \param[in] deviceContext Device context (dummy in CUDA).
- * \param[in] commandStream Device command stream to use.
+ * \param[in] deviceStream Device stream to use.
*/
- LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream);
+ LeapFrogGpu(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
~LeapFrogGpu();
/*! \brief Integrate
//! GPU context object
const DeviceContext& deviceContext_;
//! GPU stream
- CommandStream commandStream_;
+ const DeviceStream& deviceStream_;
//! GPU kernel launch config
KernelLaunchConfig kernelLaunchConfig_;
//! Number of atoms
{
// Fill with zeros so the values can be reduced to it
// Only 6 values are needed because virial is symmetrical
- clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, commandStream_);
+ clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_);
}
auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
{
config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);
}
- config.stream = commandStream_;
+ config.stream = deviceStream_.stream();
kernelParams_.pbcAiuc = pbcAiuc;
{
// Copy LINCS virial data and add it to the common virial
copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled, 0, 6,
- commandStream_, GpuApiCallBehavior::Sync, nullptr);
+ deviceStream_, GpuApiCallBehavior::Sync, nullptr);
// Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
virialScaled[XX][XX] += h_virialScaled_[0];
LincsGpu::LincsGpu(int numIterations,
int expansionOrder,
const DeviceContext& deviceContext,
- CommandStream commandStream) :
+ const DeviceStream& deviceStream) :
deviceContext_(deviceContext),
- commandStream_(commandStream)
+ deviceStream_(deviceStream)
{
kernelParams_.numIterations = numIterations;
kernelParams_.expansionOrder = expansionOrder;
// Copy data to GPU.
copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(), 0,
- kernelParams_.numConstraintsThreads, commandStream_,
- GpuApiCallBehavior::Sync, nullptr);
+ kernelParams_.numConstraintsThreads, deviceStream_, GpuApiCallBehavior::Sync,
+ nullptr);
copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
constraintsTargetLengthsHost.data(), 0, kernelParams_.numConstraintsThreads,
- commandStream_, GpuApiCallBehavior::Sync, nullptr);
+ deviceStream_, GpuApiCallBehavior::Sync, nullptr);
copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
coupledConstraintsCountsHost.data(), 0, kernelParams_.numConstraintsThreads,
- commandStream_, GpuApiCallBehavior::Sync, nullptr);
+ deviceStream_, GpuApiCallBehavior::Sync, nullptr);
copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, coupledConstraintsIndicesHost.data(),
0, maxCoupledConstraints * kernelParams_.numConstraintsThreads,
- commandStream_, GpuApiCallBehavior::Sync, nullptr);
+ deviceStream_, GpuApiCallBehavior::Sync, nullptr);
copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(), 0,
- maxCoupledConstraints * kernelParams_.numConstraintsThreads, commandStream_,
+ maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceStream_,
GpuApiCallBehavior::Sync, nullptr);
GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n");
- copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, commandStream_,
+ copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, deviceStream_,
GpuApiCallBehavior::Sync, nullptr);
}
* \param[in] numIterations Number of iterations for the correction of the projection.
* \param[in] expansionOrder Order of the matrix inversion algorithm.
* \param[in] deviceContext Device context (dummy in CUDA).
- * \param[in] commandStream Device command stream.
+ * \param[in] deviceStream Device command stream.
*/
- LincsGpu(int numIterations, int expansionOrder, const DeviceContext& deviceContext, CommandStream commandStream);
+ LincsGpu(int numIterations,
+ int expansionOrder,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream);
/*! \brief Destructor.*/
~LincsGpu();
//! GPU context object
const DeviceContext& deviceContext_;
//! GPU stream
- CommandStream commandStream_;
+ const DeviceStream& deviceStream_;
//! Parameters and pointers, passed to the GPU kernel
LincsGpuKernelParameters kernelParams_;
{
// Fill with zeros so the values can be reduced to it
// Only 6 values are needed because virial is symmetrical
- clearDeviceBufferAsync(&d_virialScaled_, 0, 6, commandStream_);
+ clearDeviceBufferAsync(&d_virialScaled_, 0, 6, deviceStream_);
}
auto kernelPtr = getSettleKernelPtr(updateVelocities, computeVirial);
{
config.sharedMemorySize = 0;
}
- config.stream = commandStream_;
+ config.stream = deviceStream_.stream();
const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &numSettles_, &d_atomIds_,
&settleParameters_, &d_x, &d_xp, &invdt, &d_v,
if (computeVirial)
{
- copyFromDeviceBuffer(h_virialScaled_.data(), &d_virialScaled_, 0, 6, commandStream_,
+ copyFromDeviceBuffer(h_virialScaled_.data(), &d_virialScaled_, 0, 6, deviceStream_,
GpuApiCallBehavior::Sync, nullptr);
// Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
return;
}
-SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream) :
+SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
deviceContext_(deviceContext),
- commandStream_(commandStream)
+ deviceStream_(deviceStream)
{
static_assert(sizeof(real) == sizeof(float),
"Real numbers should be in single precision in GPU code.");
settler.z = iatoms[i * nral1 + 3]; // Second hydrogen index
h_atomIds_.at(i) = settler;
}
- copyToDeviceBuffer(&d_atomIds_, h_atomIds_.data(), 0, numSettles_, commandStream_,
+ copyToDeviceBuffer(&d_atomIds_, h_atomIds_.data(), 0, numSettles_, deviceStream_,
GpuApiCallBehavior::Sync, nullptr);
}
* target O-H and H-H distances. These values are also checked for
* consistency.
* \param[in] deviceContext Device context (dummy in CUDA).
- * \param[in] commandStream Device stream to use.
+ * \param[in] deviceStream Device stream to use.
*/
- SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream);
+ SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
~SettleGpu();
//! GPU context object
const DeviceContext& deviceContext_;
//! GPU stream
- CommandStream commandStream_;
+ const DeviceStream& deviceStream_;
//! Scaled virial tensor (9 reals, GPU)
std::vector<float> h_virialScaled_;
{
DeviceInformation deviceInfo;
const DeviceContext deviceContext(deviceInfo);
+ const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
auto lincsGpu = std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder,
- deviceContext, nullptr);
+ deviceContext, deviceStream);
bool updateVelocities = true;
int numAtoms = testData->numAtoms_;
allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
- copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, nullptr,
+ copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, deviceStream,
GpuApiCallBehavior::Sync, nullptr);
- copyToDeviceBuffer(&d_xp, (float3*)(testData->xPrime_.data()), 0, numAtoms, nullptr,
+ copyToDeviceBuffer(&d_xp, (float3*)(testData->xPrime_.data()), 0, numAtoms, deviceStream,
GpuApiCallBehavior::Sync, nullptr);
if (updateVelocities)
{
- copyToDeviceBuffer(&d_v, (float3*)(testData->v_.data()), 0, numAtoms, nullptr,
+ copyToDeviceBuffer(&d_v, (float3*)(testData->v_.data()), 0, numAtoms, deviceStream,
GpuApiCallBehavior::Sync, nullptr);
}
lincsGpu->apply(d_x, d_xp, updateVelocities, d_v, testData->invdt_, testData->computeVirial_,
testData->virialScaled_, pbcAiuc);
- copyFromDeviceBuffer((float3*)(testData->xPrime_.data()), &d_xp, 0, numAtoms, nullptr,
+ copyFromDeviceBuffer((float3*)(testData->xPrime_.data()), &d_xp, 0, numAtoms, deviceStream,
GpuApiCallBehavior::Sync, nullptr);
if (updateVelocities)
{
- copyFromDeviceBuffer((float3*)(testData->v_.data()), &d_v, 0, numAtoms, nullptr,
+ copyFromDeviceBuffer((float3*)(testData->v_.data()), &d_v, 0, numAtoms, deviceStream,
GpuApiCallBehavior::Sync, nullptr);
}
{
DeviceInformation deviceInfo;
const DeviceContext deviceContext(deviceInfo);
+ const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
int numAtoms = testData->numAtoms_;
allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
allocateDeviceBuffer(&d_f, numAtoms, deviceContext);
- copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
- copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
- copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
- copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+ copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+ copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+ copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+ copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
- auto integrator = std::make_unique<LeapFrogGpu>(deviceContext, nullptr);
+ auto integrator = std::make_unique<LeapFrogGpu>(deviceContext, deviceStream);
integrator->set(testData->mdAtoms_, testData->numTCoupleGroups_, testData->mdAtoms_.cTC);
testData->dtPressureCouple_, testData->velocityScalingMatrix_);
}
- copyFromDeviceBuffer(h_xp, &d_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
- copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+ copyFromDeviceBuffer(h_xp, &d_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+ copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
freeDeviceBuffer(&d_x);
freeDeviceBuffer(&d_xp);
DeviceInformation deviceInfo;
const DeviceContext deviceContext(deviceInfo);
+ const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
- auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, deviceContext, nullptr);
+ auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, deviceContext, deviceStream);
settleGpu->set(*testData->idef_, testData->mdatoms_);
PbcAiuc pbcAiuc;
allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
- copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
- copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+ copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
+ copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
if (updateVelocities)
{
- copyToDeviceBuffer(&d_v, (float3*)h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+ copyToDeviceBuffer(&d_v, (float3*)h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
}
settleGpu->apply(d_x, d_xp, updateVelocities, d_v, testData->reciprocalTimeStep_, calcVirial,
testData->virial_, pbcAiuc);
- copyFromDeviceBuffer((float3*)h_xp, &d_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+ copyFromDeviceBuffer((float3*)h_xp, &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr);
if (updateVelocities)
{
- copyFromDeviceBuffer((float3*)h_v, &d_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+ copyFromDeviceBuffer((float3*)h_v, &d_v, 0, numAtoms, deviceStream,
+ GpuApiCallBehavior::Sync, nullptr);
}
freeDeviceBuffer(&d_x);
#include "gromacs/utility/classhelpers.h"
class DeviceContext;
+class DeviceStream;
class GpuEventSynchronizer;
-
struct gmx_mtop_t;
enum class PbcType : int;
class InteractionDefinitions;
public:
/*! \brief Create Update-Constrain object.
*
- * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+ * The constructor is given a \p deviceStream, in which all the update and constrain
* routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
* coordinates. The event is maintained outside this class and also passed to all (if any) consumers
* of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
* \param[in] mtop Topology of the system: SETTLE gets the masses for O and H atoms
* and target O-H and H-H distances from this object.
* \param[in] deviceContext GPU device context.
- * \param[in] commandStream GPU stream to use. Can be nullptr.
+ * \param[in] deviceStream GPU stream to use.
* \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU.
*/
UpdateConstrainGpu(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice);
~UpdateConstrainGpu();
UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& /* ir */,
const gmx_mtop_t& /* mtop */,
const DeviceContext& /* deviceContext */,
- const void* /* commandStream */,
+ const DeviceStream& /* deviceStream */,
GpuEventSynchronizer* /* xUpdatedOnDevice */) :
impl_(nullptr)
{
clear_mat(virial);
// Make sure that the forces are ready on device before proceeding with the update.
- fReadyOnDevice->enqueueWaitEvent(commandStream_);
+ fReadyOnDevice->enqueueWaitEvent(deviceStream_);
// The integrate should save a copy of the current coordinates in d_xp_ and write updated once
// into d_x_. The d_xp_ is only needed by constraints.
}
}
- coordinatesReady_->markEvent(commandStream_);
+ coordinatesReady_->markEvent(deviceStream_);
return;
}
"scaleCoordinates_kernel", kernelArgs);
// TODO: Although this only happens on the pressure coupling steps, this synchronization
// can affect the performance if nstpcouple is small.
- gpuStreamSynchronize(commandStream_);
+ deviceStream_.synchronize();
}
UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice) :
deviceContext_(deviceContext),
+ deviceStream_(deviceStream),
coordinatesReady_(xUpdatedOnDevice)
{
GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr.");
- commandStream != nullptr ? commandStream_ = *static_cast<const CommandStream*>(commandStream)
- : commandStream_ = nullptr;
- integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, commandStream_);
- lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, commandStream_);
- settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, commandStream_);
+ integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, deviceStream_);
+ lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, deviceStream_);
+ settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, deviceStream_);
coordinateScalingKernelLaunchConfig_.blockSize[0] = c_threadsPerBlock;
coordinateScalingKernelLaunchConfig_.blockSize[1] = 1;
coordinateScalingKernelLaunchConfig_.blockSize[2] = 1;
coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
- coordinateScalingKernelLaunchConfig_.stream = commandStream_;
+ coordinateScalingKernelLaunchConfig_.stream = deviceStream_.stream();
}
UpdateConstrainGpu::Impl::~Impl() {}
UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice) :
- impl_(new Impl(ir, mtop, deviceContext, commandStream, xUpdatedOnDevice))
+ impl_(new Impl(ir, mtop, deviceContext, deviceStream, xUpdatedOnDevice))
{
}
public:
/*! \brief Create Update-Constrain object.
*
- * The constructor is given a non-nullptr \p commandStream, in which all the update and constrain
+ * The constructor is given a \p deviceStream, in which all the update and constrain
* routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
* coordinates. The event is maintained outside this class and also passed to all (if any) consumers
* of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
* \param[in] mtop Topology of the system: SETTLE gets the masses for O and H atoms
* and target O-H and H-H distances from this object.
* \param[in] deviceContext GPU device context.
- * \param[in] commandStream GPU stream to use. Can be nullptr.
+ * \param[in] deviceStream GPU stream to use.
* \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU.
*/
Impl(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
- const void* commandStream,
+ const DeviceStream& deviceStream,
GpuEventSynchronizer* xUpdatedOnDevice);
~Impl();
//! GPU context object
const DeviceContext& deviceContext_;
//! GPU stream
- CommandStream commandStream_ = nullptr;
+ const DeviceStream& deviceStream_;
//! GPU kernel launch config
KernelLaunchConfig coordinateScalingKernelLaunchConfig_;
GMX_RELEASE_ASSERT(fr->deviceContext != nullptr,
"GPU device context should be initialized to use GPU update.");
-
+ GMX_RELEASE_ASSERT(stateGpu->getUpdateStream() != nullptr,
+ "Update stream can not be nullptr when update is on a GPU.");
integrator = std::make_unique<UpdateConstrainGpu>(*ir, *top_global, *fr->deviceContext,
- stateGpu->getUpdateStream(),
+ *stateGpu->getUpdateStream(),
stateGpu->xUpdatedOnDevice());
integrator->setPbc(PbcType::Xyz, state->box);
&& useGpuForNonbonded && is1D(*cr->dd))
{
// TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
- void* streamLocal =
+ const DeviceStream* localStream =
Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
- void* streamNonLocal = Nbnxm::gpu_get_command_stream(
+ const DeviceStream* nonLocalStream = Nbnxm::gpu_get_command_stream(
fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
GMX_RELEASE_ASSERT(
fr->deviceContext != nullptr,
"GPU device context should be initialized to use GPU halo exchange.");
- constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, streamLocal, streamNonLocal);
+ GMX_RELEASE_ASSERT(localStream != nullptr,
+ "Local non-bonded stream can't be nullptr when using GPU "
+ "halo exchange.");
+ GMX_RELEASE_ASSERT(nonLocalStream != nullptr,
+ "Non-local non-bonded stream can't be nullptr when using "
+ "GPU halo exchange.");
+ constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, *localStream, *nonLocalStream);
}
}
}
GMX_RELEASE_ASSERT(
fr->deviceContext != nullptr,
"Device context can not be nullptr when computing bonded interactions on GPU.");
- gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, stream, wcycle);
+ GMX_RELEASE_ASSERT(stream != nullptr,
+ "Can'r run GPU version of bonded forces in nullptr stream.");
+ gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, *stream, wcycle);
fr->gpuBonded = gpuBonded.get();
}
&& ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME))
|| runScheduleWork.simulationWork.useGpuBufferOps))
{
- const void* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
- const void* localStream =
+ const DeviceStream* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
+ const DeviceStream* localStream =
fr->nbv->gpu_nbv != nullptr
? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local)
: nullptr;
- const void* nonLocalStream =
+ const DeviceStream* nonLocalStream =
fr->nbv->gpu_nbv != nullptr
? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
: nullptr;
#include "locality.h"
class DeviceContext;
+class DeviceStream;
class GpuEventSynchronizer;
struct gmx_wallcycle;
* \p pmeStream argument needs to be a valid OpenCL queue object
* which must have been created in \p deviceContext.
*
- * \todo Make a \p CommandStream visible in the CPU parts of the code so we
- * will not have to pass a void*.
- *
* \param[in] pmeStream Device PME stream, nullptr allowed.
* \param[in] localStream Device NBNXM local stream, nullptr allowed.
* \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
- StatePropagatorDataGpu(const void* pmeStream,
- const void* localStream,
- const void* nonLocalStream,
+ StatePropagatorDataGpu(const DeviceStream* pmeStream,
+ const DeviceStream* localStream,
+ const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize,
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
- StatePropagatorDataGpu(const void* pmeStream,
+ StatePropagatorDataGpu(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize,
*
* \returns The device command stream to use in update-constraints.
*/
- void* getUpdateStream();
+ const DeviceStream* getUpdateStream();
/*! \brief Getter for the number of local atoms.
*
{
};
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */,
- const void* /* localStream */,
- const void* /* nonLocalStream */,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream */,
+ const DeviceStream* /* localStream */,
+ const DeviceStream* /* nonLocalStream */,
const DeviceContext& /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
int /* paddingSize */,
{
}
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream */,
const DeviceContext& /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
int /* paddingSize */,
}
-void* StatePropagatorDataGpu::getUpdateStream()
+const DeviceStream* StatePropagatorDataGpu::getUpdateStream()
{
GMX_ASSERT(false,
"A CPU stub method from GPU state propagator data was called instead of one from "
* \p pmeStream argument needs to be a valid OpenCL queue object
* which must have been created in \p deviceContext.
*
- * \todo Make a \p CommandStream visible in the CPU parts of the code so we
- * will not have to pass a void*.
- *
* \param[in] pmeStream Device PME stream, nullptr allowed.
* \param[in] localStream Device NBNXM local stream, nullptr allowed.
* \param[in] nonLocalStream Device NBNXM non-local stream, nullptr allowed.
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
- Impl(const void* pmeStream,
- const void* localStream,
- const void* nonLocalStream,
+ Impl(const DeviceStream* pmeStream,
+ const DeviceStream* localStream,
+ const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize,
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] wcycle Wall cycle counter data.
*/
- Impl(const void* pmeStream,
+ Impl(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize,
*
* \returns The device command stream to use in update-constraints.
*/
- void* getUpdateStream();
+ const DeviceStream* getUpdateStream();
/*! \brief Getter for the number of local atoms.
*
private:
//! GPU PME stream.
- CommandStream pmeStream_ = nullptr;
+ const DeviceStream* pmeStream_ = nullptr;
//! GPU NBNXM local stream.
- CommandStream localStream_ = nullptr;
- //! GPU NBNXM non-local stream
- CommandStream nonLocalStream_ = nullptr;
+ const DeviceStream* localStream_ = nullptr;
+ //! GPU NBNXM non-local stream.
+ const DeviceStream* nonLocalStream_ = nullptr;
//! GPU update-constraints stream.
- CommandStream updateStream_ = nullptr;
+ const DeviceStream* updateStream_ = nullptr;
+
+ //! An owning DeviceStream for the update stream, used when its lifetime is managed here. Temporary.
+ DeviceStream updateStreamOwn_;
// Streams to use for coordinates H2D and D2H copies (one event for each atom locality)
- EnumerationArray<AtomLocality, CommandStream> xCopyStreams_ = { { nullptr } };
+ EnumerationArray<AtomLocality, const DeviceStream*> xCopyStreams_ = { { nullptr } };
// Streams to use for velocities H2D and D2H copies (one event for each atom locality)
- EnumerationArray<AtomLocality, CommandStream> vCopyStreams_ = { { nullptr } };
+ EnumerationArray<AtomLocality, const DeviceStream*> vCopyStreams_ = { { nullptr } };
// Streams to use for forces H2D and D2H copies (one event for each atom locality)
- EnumerationArray<AtomLocality, CommandStream> fCopyStreams_ = { { nullptr } };
+ EnumerationArray<AtomLocality, const DeviceStream*> fCopyStreams_ = { { nullptr } };
/*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality)
*
* \param[in] h_data Host-side buffer.
* \param[in] dataSize Device-side data allocation size.
* \param[in] atomLocality If all, local or non-local ranges should be copied.
- * \param[in] commandStream GPU stream to execute copy in.
+ * \param[in] deviceStream GPU stream to execute copy in.
*/
void copyToDevice(DeviceBuffer<RVec> d_data,
gmx::ArrayRef<const gmx::RVec> h_data,
int dataSize,
AtomLocality atomLocality,
- CommandStream commandStream);
+ const DeviceStream& deviceStream);
/*! \brief Performs the copy of data from device to host buffer.
*
* \param[in] d_data Device-side buffer.
* \param[in] dataSize Device-side data allocation size.
* \param[in] atomLocality If all, local or non-local ranges should be copied.
- * \param[in] commandStream GPU stream to execute copy in.
+ * \param[in] deviceStream GPU stream to execute copy in.
*/
void copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
DeviceBuffer<RVec> d_data,
int dataSize,
AtomLocality atomLocality,
- CommandStream commandStream);
+ const DeviceStream& deviceStream);
};
} // namespace gmx
namespace gmx
{
-StatePropagatorDataGpu::Impl::Impl(const void* pmeStream,
- const void* localStream,
- const void* nonLocalStream,
+StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream,
+ const DeviceStream* localStream,
+ const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize,
GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");
// The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context.
- pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
- updateStream_ = *static_cast<const CommandStream*>(pmeStream);
+ pmeStream_ = pmeStream;
+ updateStream_ = pmeStream;
GMX_UNUSED_VALUE(localStream);
GMX_UNUSED_VALUE(nonLocalStream);
}
{
if (pmeStream != nullptr)
{
- pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
+ pmeStream_ = pmeStream;
}
if (localStream != nullptr)
{
- localStream_ = *static_cast<const CommandStream*>(localStream);
+ localStream_ = localStream;
}
if (nonLocalStream != nullptr)
{
- nonLocalStream_ = *static_cast<const CommandStream*>(nonLocalStream);
+ nonLocalStream_ = nonLocalStream;
}
// TODO: The update stream should be created only when it is needed.
# if (GMX_GPU == GMX_GPU_CUDA)
- cudaError_t stat;
- stat = cudaStreamCreate(&updateStream_);
+ cudaError_t stat;
+ cudaStream_t stream;
+ stat = cudaStreamCreate(&stream);
CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
+ updateStreamOwn_.setStream(stream);
+ updateStream_ = &updateStreamOwn_;
# endif
}
fCopyStreams_[AtomLocality::All] = updateStream_;
}
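The interim arrangement above pairs an owning DeviceStream member with a non-owning pointer, so all downstream code accesses streams uniformly through pointers. A sketch of the idea, assuming setStream() adopts the raw handle as the hunk implies:

cudaStream_t raw  = nullptr;
cudaError_t  stat = cudaStreamCreate(&raw);
CU_RET_ERR(stat, "CUDA stream creation failed");
updateStreamOwn_.setStream(raw);   // owning member adopts the raw handle and will release it
updateStream_ = &updateStreamOwn_; // the rest of the class sees only the non-owning pointer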
-StatePropagatorDataGpu::Impl::Impl(const void* pmeStream,
+StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize,
"This object should only be constructed on the GPU code-paths.");
GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set.");
- pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
-
- localStream_ = nullptr;
+ pmeStream_ = pmeStream;
+ localStream_ = pmeStream; // For clearing the force buffer
nonLocalStream_ = nullptr;
updateStream_ = nullptr;
if (paddingAllocationSize > 0)
{
// The PME stream is used here because the padding region of d_x_ is only in the PME task.
- clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, pmeStream_);
+ clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, *pmeStream_);
}
reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
// since the force buffer ops are not implemented in OpenCL.
if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
{
- clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
+ clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_);
}
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
const gmx::ArrayRef<const gmx::RVec> h_data,
int dataSize,
AtomLocality atomLocality,
- CommandStream commandStream)
+ const DeviceStream& deviceStream)
{
GMX_UNUSED_VALUE(dataSize);
GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
- GMX_ASSERT(commandStream != nullptr,
+ GMX_ASSERT(deviceStream.stream() != nullptr,
"No stream is valid for copying with given atom locality.");
wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
"The host buffer is smaller than the requested copy range.");
copyToDeviceBuffer(&d_data, reinterpret_cast<const RVec*>(&h_data.data()[atomsStartAt]),
- atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
+ atomsStartAt, numAtomsToCopy, deviceStream, transferKind_, nullptr);
}
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
DeviceBuffer<RVec> d_data,
int dataSize,
AtomLocality atomLocality,
- CommandStream commandStream)
+ const DeviceStream& deviceStream)
{
GMX_UNUSED_VALUE(dataSize);
GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
- GMX_ASSERT(commandStream != nullptr,
+ GMX_ASSERT(deviceStream.stream() != nullptr,
"No stream is valid for copying with given atom locality.");
wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
"The host buffer is smaller than the requested copy range.");
copyFromDeviceBuffer(reinterpret_cast<RVec*>(&h_data.data()[atomsStartAt]), &d_data,
- atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
+ atomsStartAt, numAtomsToCopy, deviceStream, transferKind_, nullptr);
}
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
AtomLocality atomLocality)
{
GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = xCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
+ const DeviceStream* deviceStream = xCopyStreams_[atomLocality];
+ GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying positions with given atom locality.");
wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream);
+ copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);
// markEvent is skipped in OpenCL as:
// - it's not needed, copy is done in the same stream as the only consumer task (PME)
// TODO: remove this by adding an event-mark free flavor of this function
if (GMX_GPU == GMX_GPU_CUDA)
{
- xReadyOnDevice_[atomLocality].markEvent(xCopyStreams_[atomLocality]);
+ xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
}
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
{
GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = xCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
+ const DeviceStream* deviceStream = xCopyStreams_[atomLocality];
+ GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying positions with given atom locality.");
wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream);
+ copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, *deviceStream);
// Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
- xReadyOnHost_[atomLocality].markEvent(commandStream);
+ xReadyOnHost_[atomLocality].markEvent(*deviceStream);
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
AtomLocality atomLocality)
{
GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = vCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
+ const DeviceStream* deviceStream = vCopyStreams_[atomLocality];
+ GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying velocities with given atom locality.");
wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- copyToDevice(d_v_, h_v, d_vSize_, atomLocality, commandStream);
- vReadyOnDevice_[atomLocality].markEvent(commandStream);
+ copyToDevice(d_v_, h_v, d_vSize_, atomLocality, *deviceStream);
+ vReadyOnDevice_[atomLocality].markEvent(*deviceStream);
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v, AtomLocality atomLocality)
{
GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = vCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
+ const DeviceStream* deviceStream = vCopyStreams_[atomLocality];
+ GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying velocities with given atom locality.");
wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream);
- vReadyOnHost_[atomLocality].markEvent(commandStream);
+ copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, *deviceStream);
+ vReadyOnHost_[atomLocality].markEvent(*deviceStream);
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
AtomLocality atomLocality)
{
GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = fCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
+ const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+ GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying forces with given atom locality.");
wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
- fReadyOnDevice_[atomLocality].markEvent(commandStream);
+ copyToDevice(d_f_, h_f, d_fSize_, atomLocality, *deviceStream);
+ fReadyOnDevice_[atomLocality].markEvent(*deviceStream);
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
{
GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = fCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
+ const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+ GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying forces with given atom locality.");
wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
- copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
- fReadyOnHost_[atomLocality].markEvent(commandStream);
+ copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, *deviceStream);
+ fReadyOnHost_[atomLocality].markEvent(*deviceStream);
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
}
-void* StatePropagatorDataGpu::Impl::getUpdateStream()
+const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream()
{
- return &updateStream_;
+ return updateStream_;
}
int StatePropagatorDataGpu::Impl::numAtomsLocal()
}
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream,
- const void* localStream,
- const void* nonLocalStream,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* pmeStream,
+ const DeviceStream* localStream,
+ const DeviceStream* nonLocalStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize,
{
}
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* pmeStream,
const DeviceContext& deviceContext,
GpuApiCallBehavior transferKind,
int paddingSize,
}
-void* StatePropagatorDataGpu::getUpdateStream()
+const DeviceStream* StatePropagatorDataGpu::getUpdateStream()
{
return impl_->getUpdateStream();
}
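Callers that previously received a void* now get a possibly-null const DeviceStream* and must check it before dereferencing, as the mdrun hunk earlier in this change does. A minimal caller-side sketch; useStream is a hypothetical consumer taking a const DeviceStream&:

const DeviceStream* updateStream = stateGpu->getUpdateStream();
GMX_RELEASE_ASSERT(updateStream != nullptr,
                   "Update stream can not be nullptr when update runs on a GPU.");
useStream(*updateStream); // hypothetical consumer taking const DeviceStream&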
*/
void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
- cudaStream_t stream = nb->stream[interactionLocality];
+ const DeviceStream& deviceStream = nb->deviceStreams[interactionLocality];
/* When we get here all misc operations issued in the local stream as well as
the local xq H2D are done,
{
if (interactionLocality == InteractionLocality::Local)
{
- cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
+ cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, deviceStream.stream());
CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
}
else
{
- cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
+ cudaError_t stat =
+ cudaStreamWaitEvent(deviceStream.stream(), nb->misc_ops_and_local_H2D_done, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
}
}
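The dependency insertion above is the standard CUDA event handshake between the local and non-local streams; a stand-alone sketch with placeholder names (the real code reuses nb->misc_ops_and_local_H2D_done rather than creating a fresh event):

cudaError_t stat;
cudaEvent_t localDone;
stat = cudaEventCreateWithFlags(&localDone, cudaEventDisableTiming);
CU_RET_ERR(stat, "cudaEventCreateWithFlags failed");
stat = cudaEventRecord(localDone, localStream.stream()); // mark progress in the local stream
CU_RET_ERR(stat, "cudaEventRecord failed");
stat = cudaStreamWaitEvent(nonLocalStream.stream(), localDone, 0); // gate the non-local stream
CU_RET_ERR(stat, "cudaStreamWaitEvent failed");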
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
- cu_atomdata_t* adat = nb->atdat;
- cu_plist_t* plist = nb->plist[iloc];
- cu_timers_t* t = nb->timers;
- cudaStream_t stream = nb->stream[iloc];
+ cu_atomdata_t* adat = nb->atdat;
+ cu_plist_t* plist = nb->plist[iloc];
+ cu_timers_t* t = nb->timers;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed HtoD section */
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
+ t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
}
cu_copy_H2D_async(adat->xq + adat_begin,
static_cast<const void*>(nbatom->x().data() + adat_begin * 4),
- adat_len * sizeof(*adat->xq), stream);
+ adat_len * sizeof(*adat->xq), deviceStream.stream());
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
+ t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
}
/* When we get here all misc operations issued in the local stream as well as
*/
void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc)
{
- cu_atomdata_t* adat = nb->atdat;
- cu_nbparam_t* nbp = nb->nbparam;
- cu_plist_t* plist = nb->plist[iloc];
- cu_timers_t* t = nb->timers;
- cudaStream_t stream = nb->stream[iloc];
+ cu_atomdata_t* adat = nb->atdat;
+ cu_nbparam_t* nbp = nb->nbparam;
+ cu_plist_t* plist = nb->plist[iloc];
+ cu_timers_t* t = nb->timers;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
- t->interaction[iloc].nb_k.openTimingRegion(stream);
+ t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
}
/* Kernel launch config:
config.blockSize[2] = num_threads_z;
config.gridSize[0] = nblock;
config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->deviceInfo, nbp);
- config.stream = stream;
+ config.stream = deviceStream.stream();
if (debug)
{
if (bDoTime)
{
- t->interaction[iloc].nb_k.closeTimingRegion(stream);
+ t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
}
if (GMX_NATIVE_WINDOWS)
{
/* Windows: force flushing WDDM queue */
- cudaStreamQuery(stream);
+ cudaStreamQuery(deviceStream.stream());
}
}
void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
- cu_atomdata_t* adat = nb->atdat;
- cu_nbparam_t* nbp = nb->nbparam;
- cu_plist_t* plist = nb->plist[iloc];
- cu_timers_t* t = nb->timers;
- cudaStream_t stream = nb->stream[iloc];
+ cu_atomdata_t* adat = nb->atdat;
+ cu_nbparam_t* nbp = nb->nbparam;
+ cu_plist_t* plist = nb->plist[iloc];
+ cu_timers_t* t = nb->timers;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed prune calculation section */
if (bDoTime)
{
- timer->openTimingRegion(stream);
+ timer->openTimingRegion(deviceStream);
}
/* Kernel launch config:
config.blockSize[2] = num_threads_z;
config.gridSize[0] = nblock;
config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
- config.stream = stream;
+ config.stream = deviceStream.stream();
if (debug)
{
if (bDoTime)
{
- timer->closeTimingRegion(stream);
+ timer->closeTimingRegion(deviceStream);
}
if (GMX_NATIVE_WINDOWS)
{
/* Windows: force flushing WDDM queue */
- cudaStreamQuery(stream);
+ cudaStreamQuery(deviceStream.stream());
}
}
const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
/* extract the data */
- cu_atomdata_t* adat = nb->atdat;
- cu_timers_t* t = nb->timers;
- bool bDoTime = nb->bDoTime;
- cudaStream_t stream = nb->stream[iloc];
+ cu_atomdata_t* adat = nb->atdat;
+ cu_timers_t* t = nb->timers;
+ bool bDoTime = nb->bDoTime;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
/* don't launch non-local copy-back if there was no non-local work to do */
if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
/* beginning of timed D2H section */
if (bDoTime)
{
- t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
+ t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
}
/* With DD the local D2H transfer can only start after the non-local
kernel has finished. */
if (iloc == InteractionLocality::Local && nb->bUseTwoStreams)
{
- stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
+ stat = cudaStreamWaitEvent(deviceStream.stream(), nb->nonlocal_done, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
}
if (!stepWork.useGpuFBufferOps)
{
cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
- (adat_len) * sizeof(*adat->f), stream);
+ (adat_len) * sizeof(*adat->f), deviceStream.stream());
}
/* After the non-local D2H is launched the nonlocal_done event can be
back first. */
if (iloc == InteractionLocality::NonLocal)
{
- stat = cudaEventRecord(nb->nonlocal_done, stream);
+ stat = cudaEventRecord(nb->nonlocal_done, deviceStream.stream());
CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
}
/* DtoH fshift when virial is needed */
if (stepWork.computeVirial)
{
- cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift), stream);
+ cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift),
+ deviceStream.stream());
}
/* DtoH energies */
if (stepWork.computeEnergy)
{
- cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), stream);
- cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), stream);
+ cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), deviceStream.stream());
+ cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), deviceStream.stream());
}
}
if (bDoTime)
{
- t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
+ t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
}
}
const int numAtomsPerCell = grid.numAtomsPerCell();
Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality);
- cudaStream_t stream = nb->stream[interactionLoc];
+ const DeviceStream& deviceStream = nb->deviceStreams[interactionLoc];
int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
// avoid empty kernel launch, skip to inserting stream dependency
// ensure that coordinates are ready on the device before launching the kernel
GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
- xReadyOnDevice->enqueueWaitEvent(stream);
+ xReadyOnDevice->enqueueWaitEvent(deviceStream);
KernelLaunchConfig config;
config.blockSize[0] = c_bufOpsThreadsPerBlock;
GMX_ASSERT(config.gridSize[0] > 0,
"Can not have empty grid, early return above avoids this");
config.sharedMemorySize = 0;
- config.stream = stream;
+ config.stream = deviceStream.stream();
auto kernelFn = setFillerCoords ? nbnxn_gpu_x_to_nbat_x_kernel<true>
: nbnxn_gpu_x_to_nbat_x_kernel<false>;
launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
}
// TODO: note that this is not necessary when there are no local atoms, that is:
// (numAtoms == 0 && interactionLoc == InteractionLocality::Local)
// but for now we avoid that optimization
nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms");
GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer");
- const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
- cudaStream_t stream = nb->stream[iLocality];
- cu_atomdata_t* adat = nb->atdat;
+ const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+ const DeviceStream& deviceStream = nb->deviceStreams[iLocality];
+ cu_atomdata_t* adat = nb->atdat;
size_t gmx_used_in_debug numDependency = static_cast<size_t>(useGpuFPmeReduction)
+ static_cast<size_t>(accumulateForce);
// Enqueue wait on all dependencies passed
for (auto const synchronizer : dependencyList)
{
- synchronizer->enqueueWaitEvent(stream);
+ synchronizer->enqueueWaitEvent(deviceStream);
}
/* launch kernel */
config.gridSize[1] = 1;
config.gridSize[2] = 1;
config.sharedMemorySize = 0;
- config.stream = stream;
+ config.stream = deviceStream.stream();
auto kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel<true, false>
: nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>;
{
GMX_ASSERT(nb->localFReductionDone != nullptr,
"localFReductionDone has to be a valid pointer");
- nb->localFReductionDone->markEvent(stream);
+ nb->localFReductionDone->markEvent(deviceStream);
}
}
nb->deviceInfo = deviceInfo;
/* local/non-local GPU streams */
- stat = cudaStreamCreate(&nb->stream[InteractionLocality::Local]);
- CU_RET_ERR(stat, "cudaStreamCreate on stream[InterationLocality::Local] failed");
+ cudaStream_t localStream;
+ stat = cudaStreamCreate(&localStream);
+ CU_RET_ERR(stat, "cudaStreamCreate on stream[InteractionLocality::Local] failed");
+ nb->deviceStreams[InteractionLocality::Local].setStream(localStream);
if (nb->bUseTwoStreams)
{
stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority);
CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
- stat = cudaStreamCreateWithPriority(&nb->stream[InteractionLocality::NonLocal],
- cudaStreamDefault, highest_priority);
- CU_RET_ERR(stat,
- "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed");
+ cudaStream_t nonLocalStream;
+ stat = cudaStreamCreateWithPriority(&nonLocalStream, cudaStreamDefault, highest_priority);
+ CU_RET_ERR(stat,
+ "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed");
+ nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream);
}
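
// These hunks rely on a DeviceStream wrapper defined elsewhere. A minimal
// sketch of the CUDA-build interface implied by the call sites in this patch
// (setStream(), stream(), synchronize()); ownership and destruction behaviour
// are assumptions for illustration, not taken from the patch:
#include <cuda_runtime.h>

class DeviceStream
{
public:
    DeviceStream() = default;
    //! Adopts a stream created elsewhere, e.g. via cudaStreamCreate().
    void setStream(cudaStream_t stream) { stream_ = stream; }
    //! Returns the raw handle for APIs that still take cudaStream_t.
    cudaStream_t stream() const { return stream_; }
    //! Blocks the host until all work enqueued in this stream has completed.
    void synchronize() const { cudaStreamSynchronize(stream_); }

private:
    cudaStream_t stream_ = nullptr;
};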
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
- char sbuf[STRLEN];
- bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
- cudaStream_t stream = nb->stream[iloc];
- cu_plist_t* d_plist = nb->plist[iloc];
+ char sbuf[STRLEN];
+ bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ cu_plist_t* d_plist = nb->plist[iloc];
if (d_plist->na_c < 0)
{
if (bDoTime)
{
- iTimers.pl_h2d.openTimingRegion(stream);
+ iTimers.pl_h2d.openTimingRegion(deviceStream);
iTimers.didPairlistH2D = true;
}
reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
DeviceContext());
- copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream,
+ copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
DeviceContext());
- copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream,
+ copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
&d_plist->excl_nalloc, DeviceContext());
- copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream,
+ copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
if (bDoTime)
{
- iTimers.pl_h2d.closeTimingRegion(stream);
+ iTimers.pl_h2d.closeTimingRegion(deviceStream);
}
/* the next use of this list will be the first one, so we need to prune */
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
cu_atomdata_t* adat = nb->atdat;
- cudaStream_t ls = nb->stream[InteractionLocality::Local];
+ cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream();
/* only if we have a dynamic box */
if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
{
cudaError_t stat;
cu_atomdata_t* adat = nb->atdat;
- cudaStream_t ls = nb->stream[InteractionLocality::Local];
+ cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream();
stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
{
cudaError_t stat;
cu_atomdata_t* adat = nb->atdat;
- cudaStream_t ls = nb->stream[InteractionLocality::Local];
+ cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream();
stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
- cudaError_t stat;
- int nalloc, natoms;
- bool realloced;
- bool bDoTime = nb->bDoTime;
- cu_timers_t* timers = nb->timers;
- cu_atomdata_t* d_atdat = nb->atdat;
- cudaStream_t ls = nb->stream[InteractionLocality::Local];
+ cudaError_t stat;
+ int nalloc, natoms;
+ bool realloced;
+ bool bDoTime = nb->bDoTime;
+ cu_timers_t* timers = nb->timers;
+ cu_atomdata_t* d_atdat = nb->atdat;
+ const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
natoms = nbat->numAtoms();
realloced = false;
if (bDoTime)
{
/* time async copy */
- timers->atdat.openTimingRegion(ls);
+ timers->atdat.openTimingRegion(deviceStream);
}
/* need to reallocate if we have to copy more atoms than the amount of space
if (useLjCombRule(nb->nbparam))
{
cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(),
- natoms * sizeof(*d_atdat->lj_comb), ls);
+ natoms * sizeof(*d_atdat->lj_comb), deviceStream.stream());
}
else
{
cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(),
- natoms * sizeof(*d_atdat->atom_types), ls);
+ natoms * sizeof(*d_atdat->atom_types), deviceStream.stream());
}
if (bDoTime)
{
- timers->atdat.closeTimingRegion(ls);
+ timers->atdat.closeTimingRegion(deviceStream);
}
}
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
delete nb->timers;
- if (nb->bDoTime)
- {
- /* The non-local counters/stream (second in the array) are needed only with DD. */
- for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
- {
- stat = cudaStreamDestroy(nb->stream[i]);
- CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
- }
- }
if (!useLjCombRule(nb->nbparam))
{
return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
}
-void* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
+const DeviceStream* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
{
assert(nb);
- return static_cast<void*>(&nb->stream[iloc]);
+ return &nb->deviceStreams[iloc];
}
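
// With the accessor returning const DeviceStream* instead of void*, callers no
// longer need an unchecked cast. A hypothetical call site; the wrapping
// function is illustrative only, the accessor is the one defined above:
void syncLocalNonbondedStream(NbnxmGpu* nb)
{
    const DeviceStream* deviceStream = gpu_get_command_stream(nb, gmx::InteractionLocality::Local);
    GMX_ASSERT(deviceStream != nullptr, "A valid stream is expected in GPU builds");
    // Previously: cudaStreamSynchronize(*static_cast<cudaStream_t*>(ptr));
    deviceStream->synchronize();
}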
void* gpu_get_xq(NbnxmGpu* nb)
/* TODO Remove explicit pinning from host arrays and manage it in a more natural way */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
- cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
- bool bDoTime = gpu_nbv->bDoTime;
- const int maxNumColumns = gridSet.numColumnsMax();
+ const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local];
+ bool bDoTime = gpu_nbv->bDoTime;
+ const int maxNumColumns = gridSet.numColumnsMax();
reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(),
&gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext());
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
}
- copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream,
+ copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, deviceStream,
GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
}
}
{
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
}
int* destPtr = &gpu_nbv->cxy_na[maxNumColumns * g];
- copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, deviceStream,
+ GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
}
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
}
destPtr = &gpu_nbv->cxy_ind[maxNumColumns * g];
- copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, deviceStream,
+ GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
}
}
}
GpuEventSynchronizer* const localReductionDone)
{
- cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
+ const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local];
GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
gpu_nbv->localFReductionDone = localReductionDone;
{
reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc,
DeviceContext());
- copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr);
+ copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, deviceStream,
+ GpuApiCallBehavior::Async, nullptr);
}
return;
/*! \brief staging area where fshift/energies get downloaded */
nb_staging_t nbst;
/*! \brief local and non-local GPU streams */
- gmx::EnumerationArray<Nbnxm::InteractionLocality, cudaStream_t> stream = { { nullptr } };
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
/*! \brief Events used for synchronization */
/*! \{ */
// GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
- if (!haveStreamTasksCompleted(nb->stream[iLocality]))
+ if (!haveStreamTasksCompleted(nb->deviceStreams[iLocality]))
{
wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
}
else if (haveResultToWaitFor)
{
- gpuStreamSynchronize(nb->stream[iLocality]);
+ nb->deviceStreams[iLocality].synchronize();
}
// TODO: this needs to be moved later because conditional wait could break timing
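
// The branch above assumes an overload of haveStreamTasksCompleted() that
// takes a DeviceStream. A hedged sketch of such a non-blocking query for the
// CUDA build; the real overload may differ:
static bool haveStreamTasksCompleted(const DeviceStream& deviceStream)
{
    cudaError_t stat = cudaStreamQuery(deviceStream.stream());
    if (stat == cudaErrorNotReady)
    {
        // Work is still in flight; this is not an error.
        return false;
    }
    CU_RET_ERR(stat, "Unexpected failure while querying stream tasks");
    return true;
}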
#include "gromacs/mdtypes/locality.h"
class DeviceContext;
+class DeviceStream;
struct NbnxmGpu;
struct gmx_gpu_info_t;
* Note: CUDA only.
*/
CUDA_FUNC_QUALIFIER
-void* gpu_get_command_stream(NbnxmGpu gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc)
+const DeviceStream* gpu_get_command_stream(NbnxmGpu gmx_unused* nb, gmx::InteractionLocality gmx_unused iloc)
CUDA_FUNC_TERM_WITH_RETURN(nullptr);
/** Returns an opaque pointer to the GPU coordinate+charge array
/* local/nonlocal offset and length used for xq and f */
int adat_begin, adat_len;
- cl_atomdata_t* adat = nb->atdat;
- cl_plist_t* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
- cl_command_queue stream = nb->stream[iloc];
+ cl_atomdata_t* adat = nb->atdat;
+ cl_plist_t* plist = nb->plist[iloc];
+ cl_timers_t* t = nb->timers;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed HtoD section */
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
+ t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
}
/* HtoD x, q */
- ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4,
- adat_begin * sizeof(float) * 4, adat_len * sizeof(float) * 4, stream,
+ ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * sizeof(float) * 4,
+ adat_len * sizeof(float) * 4, deviceStream.stream(),
bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
+ t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
}
/* When we get here all misc operations issued in the local stream as well as
if (iloc == InteractionLocality::Local)
{
cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
- stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
+ deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
GMX_ASSERT(cl_error == CL_SUCCESS,
("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
* in the local stream in order to be able to sync with the above event
* from the non-local stream.
*/
- cl_error = clFlush(stream);
+ cl_error = clFlush(deviceStream.stream());
GMX_ASSERT(cl_error == CL_SUCCESS,
("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
}
else
{
- sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
+ sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done));
}
}
}
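
// The marker-plus-flush sequence above is what makes the cross-queue sync
// safe. A standalone sketch of the pattern, with an illustrative helper name:
static cl_int recordMarkerAndKickOff(cl_command_queue queue, cl_event* marker)
{
    // The marker completes once all commands previously enqueued in `queue`
    // have completed.
    cl_int status = clEnqueueMarkerWithWaitList(queue, 0, nullptr, marker);
    if (status != CL_SUCCESS)
    {
        return status;
    }
    // Without the flush the marker could sit unsubmitted in the host-side
    // queue while another queue waits on it, risking a deadlock.
    return clFlush(queue);
}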
*/
void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
{
- cl_atomdata_t* adat = nb->atdat;
- cl_nbparam_t* nbp = nb->nbparam;
- cl_plist_t* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
- cl_command_queue stream = nb->stream[iloc];
+ cl_atomdata_t* adat = nb->atdat;
+ cl_nbparam_t* nbp = nb->nbparam;
+ cl_plist_t* plist = nb->plist[iloc];
+ cl_timers_t* t = nb->timers;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
- t->interaction[iloc].nb_k.openTimingRegion(stream);
+ t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
}
/* kernel launch config */
KernelLaunchConfig config;
config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
- config.stream = stream;
+ config.stream = deviceStream.stream();
config.blockSize[0] = c_clSize;
config.blockSize[1] = c_clSize;
config.gridSize[0] = plist->nsci;
if (bDoTime)
{
- t->interaction[iloc].nb_k.closeTimingRegion(stream);
+ t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
}
}
*/
void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
- cl_atomdata_t* adat = nb->atdat;
- cl_nbparam_t* nbp = nb->nbparam;
- cl_plist_t* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
- cl_command_queue stream = nb->stream[iloc];
- bool bDoTime = nb->bDoTime;
+ cl_atomdata_t* adat = nb->atdat;
+ cl_nbparam_t* nbp = nb->nbparam;
+ cl_plist_t* plist = nb->plist[iloc];
+ cl_timers_t* t = nb->timers;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ bool bDoTime = nb->bDoTime;
if (plist->haveFreshList)
{
/* beginning of timed prune calculation section */
if (bDoTime)
{
- timer->openTimingRegion(stream);
+ timer->openTimingRegion(deviceStream);
}
/* Kernel launch config:
/* kernel launch config */
KernelLaunchConfig config;
config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
- config.stream = stream;
+ config.stream = deviceStream.stream();
config.blockSize[0] = c_clSize;
config.blockSize[1] = c_clSize;
config.blockSize[2] = num_threads_z;
if (bDoTime)
{
- timer->closeTimingRegion(stream);
+ timer->closeTimingRegion(deviceStream);
}
}
/* determine interaction locality from atom locality */
const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
- cl_atomdata_t* adat = nb->atdat;
- cl_timers_t* t = nb->timers;
- bool bDoTime = nb->bDoTime;
- cl_command_queue stream = nb->stream[iloc];
+ cl_atomdata_t* adat = nb->atdat;
+ cl_timers_t* t = nb->timers;
+ bool bDoTime = nb->bDoTime;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
/* don't launch non-local copy-back if there was no non-local work to do */
if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
/* beginning of timed D2H section */
if (bDoTime)
{
- t->xf[aloc].nb_d2h.openTimingRegion(stream);
+ t->xf[aloc].nb_d2h.openTimingRegion(deviceStream);
}
/* With DD the local D2H transfer can only start after the non-local
   D2H has been launched. */
if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
{
- sync_ocl_event(stream, &(nb->nonlocal_done));
+ sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done));
}
/* DtoH f */
ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * DIM, adat->f,
adat_begin * DIM * sizeof(nbatom->out[0].f[0]),
- adat_len * DIM * sizeof(nbatom->out[0].f[0]), stream,
+ adat_len * DIM * sizeof(nbatom->out[0].f[0]), deviceStream.stream(),
bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
/* kick off work */
- cl_error = clFlush(stream);
+ cl_error = clFlush(deviceStream.stream());
GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
/* After the non-local D2H is launched the nonlocal_done event can be
   recorded which signals that the local D2H can proceed. This event is not
   placed after the non-local kernel because we want the non-local data
   back first. */
if (iloc == InteractionLocality::NonLocal)
{
- cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
+ cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done));
GMX_ASSERT(cl_error == CL_SUCCESS,
("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
nb->bNonLocalStreamActive = CL_TRUE;
/* DtoH fshift when virial is needed */
if (stepWork.computeVirial)
{
- ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, SHIFTS * sizeof(nb->nbst.fshift[0]),
- stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
+ SHIFTS * sizeof(nb->nbst.fshift[0]), deviceStream.stream(),
+ bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
}
/* DtoH energies */
if (stepWork.computeEnergy)
{
- ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), stream,
+ ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), deviceStream.stream(),
bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
- ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), stream,
+ ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), deviceStream.stream(),
bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
}
}
if (bDoTime)
{
- t->xf[aloc].nb_d2h.closeTimingRegion(stream);
+ t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream);
}
}
cl_int cl_error;
cl_atomdata_t* adat = nb->atdat;
- cl_command_queue ls = nb->stream[InteractionLocality::Local];
+ cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream();
size_t local_work_size[3] = { 1, 1, 1 };
size_t global_work_size[3] = { 1, 1, 1 };
queue_properties = 0;
}
/* local/non-local GPU streams */
- nb->stream[InteractionLocality::Local] =
+ cl_command_queue localStream =
clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(),
nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
+ nb->deviceStreams[InteractionLocality::Local].setStream(localStream);
+
if (CL_SUCCESS != cl_error)
{
gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank,
{
init_plist(nb->plist[InteractionLocality::NonLocal]);
- nb->stream[InteractionLocality::NonLocal] =
+ cl_command_queue nonLocalStream =
clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(),
nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
+ nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream);
+
if (CL_SUCCESS != cl_error)
{
gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
cl_int gmx_used_in_debug cl_error;
cl_atomdata_t* atomData = nb->atdat;
- cl_command_queue ls = nb->stream[InteractionLocality::Local];
+ cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream();
cl_float value = 0.0F;
cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float), 0,
/* kick off buffer clearing kernel to ensure concurrency with constraints/update */
cl_int gmx_unused cl_error;
- cl_error = clFlush(nb->stream[InteractionLocality::Local]);
+ cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local].stream());
GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
}
// Timing accumulation should happen only if there was work to do
// because getLastRangeTime() gets skipped with empty lists later
// which leads to the counter not being reset.
- bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
- cl_command_queue stream = nb->stream[iloc];
- cl_plist_t* d_plist = nb->plist[iloc];
+ bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ cl_plist_t* d_plist = nb->plist[iloc];
if (d_plist->na_c < 0)
{
if (bDoTime)
{
- iTimers.pl_h2d.openTimingRegion(stream);
+ iTimers.pl_h2d.openTimingRegion(deviceStream);
iTimers.didPairlistH2D = true;
}
reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
deviceContext);
- copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream,
+ copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
deviceContext);
- copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream,
+ copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
&d_plist->excl_nalloc, deviceContext);
- copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream,
+ copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
if (bDoTime)
{
- iTimers.pl_h2d.closeTimingRegion(stream);
+ iTimers.pl_h2d.closeTimingRegion(deviceStream);
}
/* need to prune the pair list during the next step */
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
cl_atomdata_t* adat = nb->atdat;
- cl_command_queue ls = nb->stream[InteractionLocality::Local];
+ cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local].stream();
/* only if we have a dynamic box */
if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
//! This function is documented in the header file
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
- cl_int cl_error;
- int nalloc, natoms;
- bool realloced;
- bool bDoTime = nb->bDoTime;
- cl_timers_t* timers = nb->timers;
- cl_atomdata_t* d_atdat = nb->atdat;
- cl_command_queue ls = nb->stream[InteractionLocality::Local];
+ cl_int cl_error;
+ int nalloc, natoms;
+ bool realloced;
+ bool bDoTime = nb->bDoTime;
+ cl_timers_t* timers = nb->timers;
+ cl_atomdata_t* d_atdat = nb->atdat;
+ const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
natoms = nbat->numAtoms();
realloced = false;
if (bDoTime)
{
/* time async copy */
- timers->atdat.openTimingRegion(ls);
+ timers->atdat.openTimingRegion(deviceStream);
}
/* need to reallocate if we have to copy more atoms than the amount of space
if (useLjCombRule(nb->nbparam->vdwtype))
{
ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0, natoms * sizeof(cl_float2),
- ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+ deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
}
else
{
ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0, natoms * sizeof(int),
- ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+ deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
}
if (bDoTime)
{
- timers->atdat.closeTimingRegion(ls);
+ timers->atdat.closeTimingRegion(deviceStream);
}
/* kick off the tasks enqueued above to ensure concurrency with the search */
- cl_error = clFlush(ls);
+ cl_error = clFlush(deviceStream.stream());
GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
}
pfree(nb->nbst.fshift);
nb->nbst.fshift = nullptr;
- /* Free command queues */
- clReleaseCommandQueue(nb->stream[InteractionLocality::Local]);
- nb->stream[InteractionLocality::Local] = nullptr;
- if (nb->bUseTwoStreams)
- {
- clReleaseCommandQueue(nb->stream[InteractionLocality::NonLocal]);
- nb->stream[InteractionLocality::NonLocal] = nullptr;
- }
/* Free other events */
if (nb->nonlocal_done)
{
nb_staging_t nbst;
//! local and non-local GPU queues
- gmx::EnumerationArray<Nbnxm::InteractionLocality, cl_command_queue> stream;
+ gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
/*! \brief Events used for synchronization */
/*! \{ */
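
// On the OpenCL side, deleting the clReleaseCommandQueue() calls from
// gpu_free() implies that DeviceStream now owns its queue. A hedged sketch of
// an OpenCL-build wrapper consistent with that; the releasing destructor is an
// assumption, not shown in the patch:
#include <CL/cl.h>

class DeviceStream
{
public:
    DeviceStream() = default;
    ~DeviceStream()
    {
        if (stream_ != nullptr)
        {
            clReleaseCommandQueue(stream_);
        }
    }
    //! Adopts a queue created elsewhere, e.g. via clCreateCommandQueue().
    void setStream(cl_command_queue stream) { stream_ = stream; }
    //! Returns the raw handle for APIs that take cl_command_queue.
    cl_command_queue stream() const { return stream_; }
    //! Blocks the host until all enqueued commands have completed.
    void synchronize() const { clFinish(stream_); }

private:
    cl_command_queue stream_ = nullptr;
};
// A production version would also have to delete or define copy operations to
// avoid releasing the same queue twice.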