/* local/nonlocal offset and length used for xq and f */
int adat_begin, adat_len;
- cl_atomdata_t* adat = nb->atdat;
- cl_plist_t* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
- cl_command_queue stream = nb->stream[iloc];
+ cl_atomdata_t* adat = nb->atdat;
+ cl_plist_t* plist = nb->plist[iloc];
+ cl_timers_t* t = nb->timers;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed HtoD section */
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
+ t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
}
/* HtoD x, q */
- ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4,
- adat_begin * sizeof(float) * 4, adat_len * sizeof(float) * 4, stream,
+ ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * sizeof(float) * 4,
+ adat_len * sizeof(float) * 4, deviceStream.stream(),
bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
+ t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
}
/* When we get here all misc operations issued in the local stream as well as
if (iloc == InteractionLocality::Local)
{
cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
- stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
+ deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
GMX_ASSERT(cl_error == CL_SUCCESS,
("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
* in the local stream in order to be able to sync with the above event
* from the non-local stream.
*/
- cl_error = clFlush(stream);
+ cl_error = clFlush(deviceStream.stream());
GMX_ASSERT(cl_error == CL_SUCCESS,
("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
}
else
{
- sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
+ sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done));
}
}
}
*/
void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
{
/* Fetch the state this launch works on. The pair list and the device
 * stream are both picked from per-locality arrays on nb using iloc. */
- cl_atomdata_t* adat = nb->atdat;
- cl_nbparam_t* nbp = nb->nbparam;
- cl_plist_t* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
- cl_command_queue stream = nb->stream[iloc];
+ cl_atomdata_t* adat = nb->atdat;
+ cl_nbparam_t* nbp = nb->nbparam;
+ cl_plist_t* plist = nb->plist[iloc];
+ cl_timers_t* t = nb->timers;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
/* Timing regions below are only opened/closed when nb->bDoTime is set. */
bool bDoTime = nb->bDoTime;
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
/* The timer for this interaction locality is handed the stream object. */
- t->interaction[iloc].nb_k.openTimingRegion(stream);
+ t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
}
/* kernel launch config */
KernelLaunchConfig config;
/* Shared memory size is derived from the VdW type and the LJ-parameter
 * prefetch flag. */
config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
/* The launch config stores the handle returned by deviceStream.stream(),
 * not the DeviceStream object itself. */
- config.stream = stream;
+ config.stream = deviceStream.stream();
/* The first two block dimensions are both c_clSize; the first grid
 * dimension is taken from plist->nsci. */
config.blockSize[0] = c_clSize;
config.blockSize[1] = c_clSize;
config.gridSize[0] = plist->nsci;
/* Close the timing region opened above, on the same stream. */
if (bDoTime)
{
- t->interaction[iloc].nb_k.closeTimingRegion(stream);
+ t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
}
}
*/
void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
/* Fetch the state this launch works on. The pair list and the device
 * stream are both picked from per-locality arrays on nb using iloc. */
- cl_atomdata_t* adat = nb->atdat;
- cl_nbparam_t* nbp = nb->nbparam;
- cl_plist_t* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
- cl_command_queue stream = nb->stream[iloc];
- bool bDoTime = nb->bDoTime;
+ cl_atomdata_t* adat = nb->atdat;
+ cl_nbparam_t* nbp = nb->nbparam;
+ cl_plist_t* plist = nb->plist[iloc];
+ cl_timers_t* t = nb->timers;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+ bool bDoTime = nb->bDoTime;
/* Branch on whether the pair list for this locality is flagged as fresh. */
if (plist->haveFreshList)
{
/* beginning of timed prune calculation section */
if (bDoTime)
{
/* NOTE(review): `timer` is not declared in the lines shown here;
 * presumably it is selected from `t` earlier in the function -- confirm. */
- timer->openTimingRegion(stream);
+ timer->openTimingRegion(deviceStream);
}
/* Kernel launch config:
/* kernel launch config */
KernelLaunchConfig config;
/* Shared memory size depends on num_threads_z, which is also used as the
 * third block dimension below. */
config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
/* The launch config stores the handle returned by deviceStream.stream(),
 * not the DeviceStream object itself. */
- config.stream = stream;
+ config.stream = deviceStream.stream();
config.blockSize[0] = c_clSize;
config.blockSize[1] = c_clSize;
config.blockSize[2] = num_threads_z;
/* Close the timing region opened above, on the same stream. */
if (bDoTime)
{
- timer->closeTimingRegion(stream);
+ timer->closeTimingRegion(deviceStream);
}
}
/* determine interaction locality from atom locality */
const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
- cl_atomdata_t* adat = nb->atdat;
- cl_timers_t* t = nb->timers;
- bool bDoTime = nb->bDoTime;
- cl_command_queue stream = nb->stream[iloc];
+ cl_atomdata_t* adat = nb->atdat;
+ cl_timers_t* t = nb->timers;
+ bool bDoTime = nb->bDoTime;
+ const DeviceStream& deviceStream = nb->deviceStreams[iloc];
/* don't launch non-local copy-back if there was no non-local work to do */
if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
/* beginning of timed D2H section */
if (bDoTime)
{
- t->xf[aloc].nb_d2h.openTimingRegion(stream);
+ t->xf[aloc].nb_d2h.openTimingRegion(deviceStream);
}
/* With DD the local D2H transfer can only start after the non-local
has been launched. */
if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
{
- sync_ocl_event(stream, &(nb->nonlocal_done));
+ sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done));
}
/* DtoH f */
ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * DIM, adat->f,
adat_begin * DIM * sizeof(nbatom->out[0].f[0]),
- adat_len * DIM * sizeof(nbatom->out[0].f[0]), stream,
+ adat_len * DIM * sizeof(nbatom->out[0].f[0]), deviceStream.stream(),
bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
/* kick off work */
- cl_error = clFlush(stream);
+ cl_error = clFlush(deviceStream.stream());
GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
/* After the non-local D2H is launched the nonlocal_done event can be
data back first. */
if (iloc == InteractionLocality::NonLocal)
{
- cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
+ cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done));
GMX_ASSERT(cl_error == CL_SUCCESS,
("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
nb->bNonLocalStreamActive = CL_TRUE;
/* DtoH fshift when virial is needed */
if (stepWork.computeVirial)
{
- ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, SHIFTS * sizeof(nb->nbst.fshift[0]),
- stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
+ SHIFTS * sizeof(nb->nbst.fshift[0]), deviceStream.stream(),
+ bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
}
/* DtoH energies */
if (stepWork.computeEnergy)
{
- ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), stream,
+ ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), deviceStream.stream(),
bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
- ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), stream,
+ ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), deviceStream.stream(),
bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
}
}
if (bDoTime)
{
- t->xf[aloc].nb_d2h.closeTimingRegion(stream);
+ t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream);
}
}