config.gridSize[1] = 1;
config.gridSize[2] = 1;
config.sharedMemorySize = 0;
- config.stream = nonLocalStream_.stream();
const float3* sendBuf = d_sendBuf_;
const float3* d_x = d_x_;
const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &sendBuf, &d_x,
&indexMap, &size, &coordinateShift);
- launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
+ launchGpuKernel(kernelFn, config, nonLocalStream_, nullptr,
+ "Domdec GPU Apply X Halo Exchange", kernelArgs);
}
communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);
config.gridSize[1] = 1;
config.gridSize[2] = 1;
config.sharedMemorySize = 0;
- config.stream = nonLocalStream_.stream();
const float3* recvBuf = d_recvBuf_;
const int* indexMap = d_indexMap_;
const auto kernelArgs =
prepareGpuKernelArguments(kernelFn, config, &d_f, &recvBuf, &indexMap, &size);
- launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
+ launchGpuKernel(kernelFn, config, nonLocalStream_, nullptr,
+ "Domdec GPU Apply F Halo Exchange", kernelArgs);
}
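
Reviewer note: both halo-exchange hunks follow the same shape, so a condensed sketch may help when auditing the remaining call sites. Everything below mirrors the hunks above; c_threadsPerBlock is a hypothetical stand-in for whatever block-size constant the caller uses.

```cpp
// Condensed sketch of the new call-site shape (names from the hunks above;
// c_threadsPerBlock is a hypothetical stand-in for the caller's block size).
KernelLaunchConfig config;
config.blockSize[0]     = c_threadsPerBlock;
config.gridSize[0]      = (size + c_threadsPerBlock - 1) / c_threadsPerBlock;
config.gridSize[1]      = 1;
config.gridSize[2]      = 1;
config.sharedMemorySize = 0;
// Note: no config.stream assignment any more; the stream is passed below.

const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &sendBuf, &d_x,
                                                  &indexMap, &size, &coordinateShift);
launchGpuKernel(kernelFn, config, nonLocalStream_, nullptr,
                "Domdec GPU Apply X Halo Exchange", kernelArgs);
```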
if (pulse_ == 0)
config.blockSize[2] = atomsPerBlock;
config.gridSize[0] = dimGrid.first;
config.gridSize[1] = dimGrid.second;
- config.stream = pmeGpu->archSpecific->pmeStream_.stream();
int timingId;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
&kernelParamsPtr->atoms.d_coordinates);
#endif
- launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs);
+ launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent,
+ "PME spline/spread", kernelArgs);
pme_gpu_stop_timing(pmeGpu, timingId);
const auto& settings = pmeGpu->settings;
config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1)
/ gridLinesPerBlock;
config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim];
- config.stream = pmeGpu->archSpecific->pmeStream_.stream();
int timingId = gtPME_SOLVE;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli,
&kernelParamsPtr->constants.d_virialAndEnergy, &kernelParamsPtr->grid.d_fourierGrid);
#endif
- launchGpuKernel(kernelPtr, config, timingEvent, "PME solve", kernelArgs);
+ launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME solve",
+ kernelArgs);
pme_gpu_stop_timing(pmeGpu, timingId);
if (computeEnergyAndVirial)
config.blockSize[2] = atomsPerBlock;
config.gridSize[0] = dimGrid.first;
config.gridSize[1] = dimGrid.second;
- config.stream = pmeGpu->archSpecific->pmeStream_.stream();
// TODO test different cache configs
&kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
&kernelParamsPtr->atoms.d_forces);
#endif
- launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
+ launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME gather",
+ kernelArgs);
pme_gpu_stop_timing(pmeGpu, timingId);
if (pmeGpu->settings.useGpuForceReduction)
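
The three PME hunks share one shape: the stream argument is now pmeGpu->archSpecific->pmeStream_, while the timing event keeps flowing in separately. A hedged sketch of that shape follows; the timing-event fetch helper is not shown in these hunks, so its name here is hypothetical, and gtPME_GATHER is assumed by analogy with gtPME_SOLVE above.

```cpp
// Sketch only; the argument list is abbreviated and the fetch helper is hypothetical.
int           timingId    = gtPME_GATHER;                       // assumed by analogy with gtPME_SOLVE
CommandEvent* timingEvent = pmeFetchTimingEvent(pmeGpu, timingId); // hypothetical helper

const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr /* ... */);
launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent,
                "PME gather", kernelArgs);
pme_gpu_stop_timing(pmeGpu, timingId);
```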
* \tparam Args Types of all the kernel arguments
* \param[in] kernel Kernel function handle
* \param[in] config Kernel configuration for launching
+ * \param[in] deviceStream GPU stream to launch kernel in
* \param[in] kernelName Human readable kernel description, for error handling only
* \param[in] kernelArgs Array of the pointers to the kernel arguments, prepared by
*                       prepareGpuKernelArguments()
* \throws gmx::InternalError on kernel launch failure
template<typename... Args>
void launchGpuKernel(void (*kernel)(Args...),
const KernelLaunchConfig& config,
+ const DeviceStream& deviceStream,
CommandEvent* /*timingEvent */,
const char* kernelName,
const std::array<void*, sizeof...(Args)>& kernelArgs)
dim3 blockSize(config.blockSize[0], config.blockSize[1], config.blockSize[2]);
dim3 gridSize(config.gridSize[0], config.gridSize[1], config.gridSize[2]);
cudaLaunchKernel((void*)kernel, gridSize, blockSize, const_cast<void**>(kernelArgs.data()),
- config.sharedMemorySize, config.stream);
+ config.sharedMemorySize, deviceStream.stream());
cudaError_t status = cudaGetLastError();
if (cudaSuccess != status)
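
For a self-contained picture of the new CUDA signature, here is a minimal launch sketch. The DeviceStream setup is copied from the dummy-kernel test hunk further down; the empty kernel is defined inline for the sketch.

```cpp
// Minimal sketch, assuming the setup shown in the test hunk below.
__global__ void k_dummy_test() {}

void launchDummy(const DeviceInformation& deviceInfo)
{
    const DeviceContext deviceContext(deviceInfo);
    const DeviceStream  deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);

    KernelLaunchConfig config;
    config.blockSize[0] = 512;

    const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
    launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
}
```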
#include <cuda_profiler_api.h>
#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#include "gromacs/hardware/gpu_hw_info.h"
#include "gromacs/utility/basedefinitions.h"
try
{
KernelLaunchConfig config;
- config.blockSize[0] = 512;
- const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
- launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
+ config.blockSize[0] = 512;
+ const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
+ DeviceInformation deviceInfo;
+ const DeviceContext deviceContext(deviceInfo);
+ const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+ launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
}
catch (gmx::GromacsException& ex)
{
*/
struct KernelLaunchConfig
{
- size_t gridSize[3] = { 1, 1, 1 }; //!< Block counts
- size_t blockSize[3] = { 1, 1, 1 }; //!< Per-block thread counts
- size_t sharedMemorySize = 0; //!< Shared memory size in bytes
- cudaStream_t stream = nullptr; //!< Stream to launch kernel in
+ //! Block counts
+ size_t gridSize[3] = { 1, 1, 1 };
+ //! Per-block thread counts
+ size_t blockSize[3] = { 1, 1, 1 };
+ //! Shared memory size in bytes
+ size_t sharedMemorySize = 0;
};
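
With the stream member gone, a caller that still assigns config.stream simply fails to compile, which is what makes this refactor safe to apply mechanically. Before/after at a typical call site (numBlocks and threadsPerBlock are placeholders):

```cpp
// Before: the stream rode along inside the config.
//   config.stream = deviceStream_.stream();
//   launchGpuKernel(kernelPtr, config, nullptr, "kernel", kernelArgs);

// After: the config carries only geometry and shared-memory size;
// the DeviceStream is an explicit argument (placeholder sizes).
KernelLaunchConfig config;
config.gridSize[0]      = numBlocks;
config.blockSize[0]     = threadsPerBlock;
config.sharedMemorySize = 0;
launchGpuKernel(kernelPtr, config, deviceStream_, nullptr, "kernel", kernelArgs);
```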
//! Sets whether device code can use arrays that are embedded in structs.
*/
struct KernelLaunchConfig
{
- size_t gridSize[3] = { 1, 1, 1 }; //!< Work groups (CUDA blocks) counts
- size_t blockSize[3] = { 1, 1, 1 }; //!< Per work group (CUDA block) thread counts
- size_t sharedMemorySize = 0; //!< Shared memory size in bytes
- cl_command_queue stream = nullptr; //!< Stream to launch kernel in
+ //! Work-group (CUDA block) counts
+ size_t gridSize[3] = { 1, 1, 1 };
+ //! Per-work-group (CUDA block) thread counts
+ size_t blockSize[3] = { 1, 1, 1 };
+ //! Shared memory size in bytes
+ size_t sharedMemorySize = 0;
};
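
Side effect worth noting: once neither struct holds an API-specific handle, the CUDA and OpenCL definitions of KernelLaunchConfig differ only in comment wording. A hypothetical follow-up (not part of this patch) could consolidate them into one backend-agnostic definition:

```cpp
// Hypothetical backend-agnostic definition; not part of this change.
struct KernelLaunchConfig
{
    //! Work-group (CUDA block) counts
    size_t gridSize[3] = { 1, 1, 1 };
    //! Per-work-group (CUDA block) thread counts
    size_t blockSize[3] = { 1, 1, 1 };
    //! Shared (local) memory size in bytes
    size_t sharedMemorySize = 0;
};
```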
/*! \brief Sets whether device code can use arrays that are embedded in structs.
*
* \param[in] kernel Kernel function handle
* \param[in] config Kernel configuration for launching
+ * \param[in] deviceStream GPU stream to launch kernel in
* \param[in] timingEvent Timing event, fetched from GpuRegionTimer
* \param[in] kernelName Human readable kernel description, for error handling only
* \throws gmx::InternalError on kernel launch failure
*/
inline void launchGpuKernel(cl_kernel kernel,
const KernelLaunchConfig& config,
+ const DeviceStream& deviceStream,
CommandEvent* timingEvent,
const char* kernelName,
const void* /*kernelArgs*/)
{
globalWorkSize[i] = config.gridSize[i] * config.blockSize[i];
}
- cl_int clError = clEnqueueNDRangeKernel(config.stream, kernel, workDimensions, globalWorkOffset,
- globalWorkSize, config.blockSize, waitListSize,
- waitList, timingEvent);
+ cl_int clError = clEnqueueNDRangeKernel(deviceStream.stream(), kernel, workDimensions,
+ globalWorkOffset, globalWorkSize, config.blockSize,
+ waitListSize, waitList, timingEvent);
if (CL_SUCCESS != clError)
{
const std::string errorMessage = "GPU kernel (" + std::string(kernelName)
kernelLaunchConfig.blockSize[1] = 1;
kernelLaunchConfig.blockSize[2] = 1;
kernelLaunchConfig.sharedMemorySize = 0;
- kernelLaunchConfig.stream = deviceStream.stream();
auto kernelPtr = convertRVecToFloat3OnDevice_kernel;
const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig,
&d_float3Output, &d_rVecInput, &numElements);
- launchGpuKernel(kernelPtr, kernelLaunchConfig, nullptr, "convertRVecToFloat3OnDevice_kernel", kernelArgs);
+ launchGpuKernel(kernelPtr, kernelLaunchConfig, deviceStream, nullptr,
+ "convertRVecToFloat3OnDevice_kernel", kernelArgs);
copyFromDeviceBuffer(h_float3Output.data(), &d_float3Output, 0, numElements, deviceStream,
GpuApiCallBehavior::Sync, nullptr);
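
The synchronous copy above makes the result immediately checkable on the host. A sketch of the verification step that would typically follow (h_rVecInput is the assumed host-side source array; EXPECT_EQ assumes a GoogleTest context):

```cpp
// Sketch of a host-side check after the Sync copy; names assumed.
for (int i = 0; i < numElements; i++)
{
    EXPECT_EQ(h_float3Output[i].x, h_rVecInput[i][XX]);
    EXPECT_EQ(h_float3Output[i].y, h_rVecInput[i][YY]);
    EXPECT_EQ(h_float3Output[i].z, h_rVecInput[i][ZZ]);
}
```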
config.gridSize[0] = (fTypeRangeEnd + TPB_BONDED) / TPB_BONDED;
config.gridSize[1] = 1;
config.gridSize[2] = 1;
- config.stream = deviceStream_.stream();
auto kernelPtr = exec_kernel_gpu<calcVir, calcEner>;
kernelParams_.scaleFactor = fr->ic->epsfac * fr->fudgeQQ;
const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &kernelParams_);
- launchGpuKernel(kernelPtr, config, nullptr, "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
+ launchGpuKernel(kernelPtr, config, deviceStream_, nullptr, "exec_kernel_gpu<calcVir, calcEner>",
+ kernelArgs);
}
void GpuBonded::launchKernel(const t_forcerec* fr, const gmx::StepWorkload& stepWork, const matrix box)
const auto kernelArgs = prepareGpuKernelArguments(
kernelPtr, kernelLaunchConfig_, &numAtoms_, &d_x, &d_xp, &d_v, &d_f, &d_inverseMasses_,
&dt, &d_lambdas_, &d_tempScaleGroups_, &prVelocityScalingMatrixDiagonal_);
- launchGpuKernel(kernelPtr, kernelLaunchConfig_, nullptr, "leapfrog_kernel", kernelArgs);
+ launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr, "leapfrog_kernel", kernelArgs);
return;
}
kernelLaunchConfig_.blockSize[1] = 1;
kernelLaunchConfig_.blockSize[2] = 1;
kernelLaunchConfig_.sharedMemorySize = 0;
- kernelLaunchConfig_.stream = deviceStream_.stream();
}
LeapFrogGpu::~LeapFrogGpu()
{
config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);
}
- config.stream = deviceStream_.stream();
kernelParams_.pbcAiuc = pbcAiuc;
const auto kernelArgs =
prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt);
- launchGpuKernel(kernelPtr, config, nullptr, "lincs_kernel<updateVelocities, computeVirial>", kernelArgs);
+ launchGpuKernel(kernelPtr, config, deviceStream_, nullptr,
+ "lincs_kernel<updateVelocities, computeVirial>", kernelArgs);
if (computeVirial)
{
{
config.sharedMemorySize = 0;
}
- config.stream = deviceStream_.stream();
const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &numSettles_, &d_atomIds_,
&settleParameters_, &d_x, &d_xp, &invdt, &d_v,
&d_virialScaled_, &pbcAiuc);
- launchGpuKernel(kernelPtr, config, nullptr, "settle_kernel<updateVelocities, computeVirial>", kernelArgs);
+ launchGpuKernel(kernelPtr, config, deviceStream_, nullptr,
+ "settle_kernel<updateVelocities, computeVirial>", kernelArgs);
if (computeVirial)
{
const auto kernelArgs = prepareGpuKernelArguments(
scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_x_, &mu);
- launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, nullptr,
- "scaleCoordinates_kernel", kernelArgs);
+ launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, deviceStream_,
+ nullptr, "scaleCoordinates_kernel", kernelArgs);
// TODO: Although this only happens on the pressure coupling steps, this synchronization
// can affect the performance if nstpcouple is small.
deviceStream_.synchronize();
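
Per the TODO above, the unconditional synchronize is the known cost here. A hypothetical refinement (not part of this patch) would gate it on whether the host actually consumes the scaled coordinates this step:

```cpp
// Hypothetical sketch only; the flag below does not exist in this patch.
launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, deviceStream_,
                nullptr, "scaleCoordinates_kernel", kernelArgs);
if (hostConsumesScaledCoordinatesThisStep)
{
    deviceStream_.synchronize();
}
```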
coordinateScalingKernelLaunchConfig_.blockSize[1] = 1;
coordinateScalingKernelLaunchConfig_.blockSize[2] = 1;
coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
- coordinateScalingKernelLaunchConfig_.stream = deviceStream_.stream();
}
UpdateConstrainGpu::Impl::~Impl() {}
config.blockSize[2] = num_threads_z;
config.gridSize[0] = nblock;
config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->deviceInfo, nbp);
- config.stream = deviceStream.stream();
if (debug)
{
(plist->haveFreshList && !nb->timers->interaction[iloc].didPrune), nb->deviceInfo);
const auto kernelArgs =
prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &stepWork.computeVirial);
- launchGpuKernel(kernel, config, timingEvent, "k_calc_nb", kernelArgs);
+ launchGpuKernel(kernel, config, deviceStream, timingEvent, "k_calc_nb", kernelArgs);
if (bDoTime)
{
config.blockSize[2] = num_threads_z;
config.gridSize[0] = nblock;
config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
- config.stream = deviceStream.stream();
if (debug)
{
const auto kernel =
plist->haveFreshList ? nbnxn_kernel_prune_cuda<true> : nbnxn_kernel_prune_cuda<false>;
const auto kernelArgs = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &numParts, &part);
- launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+ launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
/* TODO: consider a more elegant way to track which kernel has been called
(combined or separate 1st pass prune, rolling prune). */
GMX_ASSERT(config.gridSize[0] > 0,
"Can not have empty grid, early return above avoids this");
config.sharedMemorySize = 0;
- config.stream = deviceStream.stream();
auto kernelFn = setFillerCoords ? nbnxn_gpu_x_to_nbat_x_kernel<true>
: nbnxn_gpu_x_to_nbat_x_kernel<false>;
const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq,
&d_xFloat3, &d_atomIndices, &d_cxy_na,
&d_cxy_ind, &cellOffset, &numAtomsPerCell);
- launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
+ launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs);
}
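
The hunk above also shows the recurring instantiation-selection pattern: a runtime bool picks between two compiled template instantiations, so the flag costs nothing inside the kernel. A toy illustration (not GROMACS code; all names invented):

```cpp
// Toy example of selecting a template instantiation at launch time.
template<bool fillValue>
__global__ void toyKernel(float* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        // fillValue is a compile-time constant inside each instantiation,
        // so this "branch" disappears after compilation.
        out[i] = fillValue ? 1.0F : 0.0F;
    }
}

void launchToy(bool fillValue, float* out, int n, cudaStream_t stream)
{
    auto kernelFn = fillValue ? toyKernel<true> : toyKernel<false>;
    kernelFn<<<(n + 127) / 128, 128, 0, stream>>>(out, n);
}
```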
// TODO: note that this is not necessary when there are no local atoms, that is:
config.gridSize[1] = 1;
config.gridSize[2] = 1;
config.sharedMemorySize = 0;
- config.stream = deviceStream.stream();
auto kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel<true, false>
: nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>;
const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_fNB, &d_fPme, &d_fTotal,
&d_cell, &atomStart, &numAtoms);
- launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
+ launchGpuKernel(kernelFn, config, deviceStream, nullptr, "FbufferOps", kernelArgs);
if (atomLocality == AtomLocality::Local)
{
KernelLaunchConfig config;
config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
- config.stream = deviceStream.stream();
config.blockSize[0] = c_clSize;
config.blockSize[1] = c_clSize;
config.gridSize[0] = plist->nsci;
&adat->fshift, &adat->lj_comb, &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d,
&nbp->coulomb_tab_climg2d, &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
- launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+ launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
}
else
{
&adat->e_el, &adat->fshift, &adat->atom_types, &adat->shift_vec, &nbp->nbfp_climg2d,
&nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d, &plist->sci, &plist->cj4,
&plist->excl, &computeFshift);
- launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+ launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
}
if (bDoTime)
/* kernel launch config */
KernelLaunchConfig config;
config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
- config.stream = deviceStream.stream();
config.blockSize[0] = c_clSize;
config.blockSize[1] = c_clSize;
config.blockSize[2] = num_threads_z;
const auto kernelArgs = prepareGpuKernelArguments(pruneKernel, config, &nbparams_params,
&adat->xq, &adat->shift_vec, &plist->sci,
&plist->cj4, &plist->imask, &numParts, &part);
- launchGpuKernel(pruneKernel, config, timingEvent, kernelName, kernelArgs);
+ launchGpuKernel(pruneKernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
if (plist->haveFreshList)
{