DeviceBuffer<RVec> d_x,
GpuEventSynchronizer* xReadyOnDevice)
{
+ GMX_ASSERT(xReadyOnDevice != nullptr, "Need a valid GpuEventSynchronizer object");
int gridBegin = 0;
int gridEnd = 0;
for (int g = gridBegin; g < gridEnd; g++)
{
- nbnxn_gpu_x_to_nbat_x(
- gridSet.grids()[g], gpu_nbv, d_x, xReadyOnDevice, locality, g, gridSet.numColumnsMax());
+ nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g],
+ gpu_nbv,
+ d_x,
+ (g == gridBegin) ? xReadyOnDevice : nullptr, // Sync on first iteration only
+ locality,
+ g,
+ gridSet.numColumnsMax(),
+ (g == gridEnd - 1));
}
}
GpuEventSynchronizer* xReadyOnDevice,
const Nbnxm::AtomLocality locality,
int gridId,
- int numColumnsMax)
+ int numColumnsMax,
+ bool mustInsertNonLocalDependency)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
const DeviceStream& deviceStream = *nb->deviceStreams[interactionLoc];
+ if (xReadyOnDevice != nullptr)
+ {
+ // We only need to wait on the first iteration of the loop
+ xReadyOnDevice->enqueueWaitEvent(deviceStream);
+ }
+
int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
// avoid empty kernel launch, skip to inserting stream dependency
if (numAtoms != 0)
// TODO: This will only work with CUDA
GMX_ASSERT(d_x, "Need a valid device pointer");
- // ensure that coordinates are ready on the device before launching the kernel
- GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
- xReadyOnDevice->enqueueWaitEvent(deviceStream);
KernelLaunchConfig config;
config.blockSize[0] = c_bufOpsThreadsPerBlock;
launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs);
}
- // TODO: note that this is not necessary when there are no local atoms, that is:
- // (numAtoms == 0 && interactionLoc == InteractionLocality::Local)
- // but for now we avoid that optimization
- nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
+ if (mustInsertNonLocalDependency)
+ {
+ Nbnxm::nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
+ }
}
DeviceBuffer<Float3> getGpuForces(NbnxmGpu* nb)
* \param[in] locality Copy coordinates for local or non-local atoms.
* \param[in] gridId Index of the grid being converted.
* \param[in] numColumnsMax Maximum number of columns in the grid.
+ * \param[in] mustInsertNonLocalDependency Whether synchronization between the local and non-local
+ *                                         streams should be added. Typically, true if and only if this is the last grid in the grid set.
*/
CUDA_FUNC_QUALIFIER
void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
GpuEventSynchronizer gmx_unused* xReadyOnDevice,
gmx::AtomLocality gmx_unused locality,
int gmx_unused gridId,
- int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
+ int gmx_unused numColumnsMax,
+ bool gmx_unused mustInsertNonLocalDependency) CUDA_FUNC_TERM;
/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
*