nb->deviceInfo = deviceInfo;
/* local/non-local GPU streams */
- stat = cudaStreamCreate(&nb->stream[InteractionLocality::Local]);
+ cudaStream_t localStream;
+ stat = cudaStreamCreate(&localStream);
+ nb->deviceStreams[InteractionLocality::Local].setStream(localStream);
CU_RET_ERR(stat, "cudaStreamCreate on stream[InterationLocality::Local] failed");
if (nb->bUseTwoStreams)
{
stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority);
CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
- stat = cudaStreamCreateWithPriority(&nb->stream[InteractionLocality::NonLocal],
- cudaStreamDefault, highest_priority);
+ cudaStream_t nonLocalStream;
+ stat = cudaStreamCreateWithPriority(&nonLocalStream, cudaStreamDefault, highest_priority);
+ nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream);
CU_RET_ERR(stat,
"cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed");
}
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
- char sbuf[STRLEN];
- bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
- cudaStream_t stream = nb->stream[iloc];
- cu_plist_t* d_plist = nb->plist[iloc];
+ char sbuf[STRLEN];
+ bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ cu_plist_t* d_plist = nb->plist[iloc];
if (d_plist->na_c < 0)
{
if (bDoTime)
{
- iTimers.pl_h2d.openTimingRegion(stream);
+ iTimers.pl_h2d.openTimingRegion(deviceStream);
iTimers.didPairlistH2D = true;
}
reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
DeviceContext());
- copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream,
+ copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
DeviceContext());
- copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream,
+ copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
&d_plist->excl_nalloc, DeviceContext());
- copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream,
+ copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream,
GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
if (bDoTime)
{
- iTimers.pl_h2d.closeTimingRegion(stream);
+ iTimers.pl_h2d.closeTimingRegion(deviceStream);
}
/* the next use of thist list we be the first one, so we need to prune */
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
cu_atomdata_t* adat = nb->atdat;
- cudaStream_t ls = nb->stream[InteractionLocality::Local];
+ cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream();
/* only if we have a dynamic box */
if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
{
cudaError_t stat;
cu_atomdata_t* adat = nb->atdat;
- cudaStream_t ls = nb->stream[InteractionLocality::Local];
+ cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream();
stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
{
cudaError_t stat;
cu_atomdata_t* adat = nb->atdat;
- cudaStream_t ls = nb->stream[InteractionLocality::Local];
+ cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local].stream();
stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
- cudaError_t stat;
- int nalloc, natoms;
- bool realloced;
- bool bDoTime = nb->bDoTime;
- cu_timers_t* timers = nb->timers;
- cu_atomdata_t* d_atdat = nb->atdat;
- cudaStream_t ls = nb->stream[InteractionLocality::Local];
+ cudaError_t stat;
+ int nalloc, natoms;
+ bool realloced;
+ bool bDoTime = nb->bDoTime;
+ cu_timers_t* timers = nb->timers;
+ cu_atomdata_t* d_atdat = nb->atdat;
+ const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
natoms = nbat->numAtoms();
realloced = false;
if (bDoTime)
{
/* time async copy */
- timers->atdat.openTimingRegion(ls);
+ timers->atdat.openTimingRegion(deviceStream);
}
/* need to reallocate if we have to copy more atoms than the amount of space
if (useLjCombRule(nb->nbparam))
{
cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(),
- natoms * sizeof(*d_atdat->lj_comb), ls);
+ natoms * sizeof(*d_atdat->lj_comb), deviceStream.stream());
}
else
{
cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(),
- natoms * sizeof(*d_atdat->atom_types), ls);
+ natoms * sizeof(*d_atdat->atom_types), deviceStream.stream());
}
if (bDoTime)
{
- timers->atdat.closeTimingRegion(ls);
+ timers->atdat.closeTimingRegion(deviceStream);
}
}
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
delete nb->timers;
- if (nb->bDoTime)
- {
- /* The non-local counters/stream (second in the array) are needed only with DD. */
- for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
- {
- stat = cudaStreamDestroy(nb->stream[i]);
- CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
- }
- }
if (!useLjCombRule(nb->nbparam))
{
return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
}
-void* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
+const DeviceStream* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
{
assert(nb);
- return static_cast<void*>(&nb->stream[iloc]);
+ return &nb->deviceStreams[iloc];
}
void* gpu_get_xq(NbnxmGpu* nb)
/* TODO Remove explicit pinning from host arrays from here and manage in a more natural way*/
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
- cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
- bool bDoTime = gpu_nbv->bDoTime;
- const int maxNumColumns = gridSet.numColumnsMax();
+ const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local];
+ bool bDoTime = gpu_nbv->bDoTime;
+ const int maxNumColumns = gridSet.numColumnsMax();
reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(),
&gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext());
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
}
- copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream,
+ copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, deviceStream,
GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
}
}
{
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
}
int* destPtr = &gpu_nbv->cxy_na[maxNumColumns * g];
- copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, deviceStream,
+ GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
}
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
}
destPtr = &gpu_nbv->cxy_ind[maxNumColumns * g];
- copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, deviceStream,
+ GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
{
- gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
}
}
}
GpuEventSynchronizer* const localReductionDone)
{
- cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
+ const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local];
GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
gpu_nbv->localFReductionDone = localReductionDone;
{
reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc,
DeviceContext());
- copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr);
+ copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, deviceStream,
+ GpuApiCallBehavior::Async, nullptr);
}
return;