const rvec *coordinatesHost)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
- GMX_ASSERT(coordinatesHost, "Need a valid host pointer");
bool bDoTime = nb->bDoTime;
Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality);
- int nCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
+ int numCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
int copyAtomStart = grid.srcAtomBegin();
cudaStream_t stream = nb->stream[interactionLoc];
// Empty domain: avoid launching a zero-byte copy
- if (nCopyAtoms == 0)
+ if (numCopyAtoms == 0)
{
return;
}
+ GMX_ASSERT(coordinatesHost, "Need a valid host pointer");
if (bDoTime)
{
rvec *devicePtrDest = reinterpret_cast<rvec *> (nb->xrvec[copyAtomStart]);
const rvec *devicePtrSrc = reinterpret_cast<const rvec *> (coordinatesHost[copyAtomStart]);
- copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, nCopyAtoms,
+ copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, numCopyAtoms,
stream, GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
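
The reordering above is the heart of this hunk: the empty-domain early return now comes before the pointer assertion, so a caller whose domain holds zero atoms may legally pass a null coordinate pointer. A minimal standalone sketch of the pattern, in which the helper name and the raw cudaMemcpyAsync are illustrative stand-ins for the copyToDeviceBuffer wrapper, not the GROMACS API:

    #include <cassert>
    #include <cuda_runtime.h>

    using rvec = float[3];

    static void copyRangeToDevice(rvec *deviceDest, const rvec *hostSrc,
                                  int firstAtom, int numCopyAtoms,
                                  cudaStream_t stream)
    {
        // Empty domain: return before touching any pointers, so a null
        // hostSrc is acceptable when there is nothing to copy.
        if (numCopyAtoms == 0)
        {
            return;
        }
        assert(hostSrc != nullptr);

        cudaMemcpyAsync(&deviceDest[firstAtom], &hostSrc[firstAtom],
                        numCopyAtoms * sizeof(rvec),
                        cudaMemcpyHostToDevice, stream);
    }
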
cudaStream_t stream = nb->stream[interactionLoc];
- // TODO: This will only work with CUDA
- GMX_ASSERT(coordinatesDevice, "Need a valid device pointer");
-
int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
// Avoid an empty kernel launch; skip ahead to inserting the stream dependency
if (numAtoms != 0)
{
+ // TODO: This will only work with CUDA
+ GMX_ASSERT(coordinatesDevice, "Need a valid device pointer");
+
KernelLaunchConfig config;
config.blockSize[0] = c_bufOpsThreadsPerBlock;
config.blockSize[1] = 1;
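
The same no-work guard applies to the kernel launch: when numAtoms is zero the launch is skipped, though the caller may still need to insert a stream dependency. A sketch of that shape in plain CUDA; the kernel body, c_threadsPerBlock, and the function names are stand-ins for the real buffer-ops kernel and c_bufOpsThreadsPerBlock:

    #include <cuda_runtime.h>

    constexpr int c_threadsPerBlock = 128;

    __global__ void transformKernel(float3 *out, const float3 *in, int numAtoms)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < numAtoms)
        {
            out[i] = in[i]; // placeholder for the real x -> nbat layout transform
        }
    }

    static void launchIfNonEmpty(float3 *out, const float3 *in, int numAtoms,
                                 cudaStream_t stream)
    {
        if (numAtoms != 0)
        {
            int numBlocks = (numAtoms + c_threadsPerBlock - 1) / c_threadsPerBlock;
            transformKernel<<<numBlocks, c_threadsPerBlock, 0, stream>>>(out, in, numAtoms);
        }
        // Even when the launch is skipped, an event can still be recorded on
        // the stream here so that dependent streams keep a consistent ordering.
    }
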
bool accumulateForce)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+ GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms");
+ GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer");
const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
cudaStream_t stream = nb->stream[iLocality];
if (useGpuFPmeReduction)
{
+ GMX_ASSERT(pmeForcesDevice, "Need a valid pmeForcesDevice pointer");
kernelFn = accumulateForce ?
nbnxn_gpu_add_nbat_f_to_f_kernel<true, true> :
nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>;
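
Selecting among the template instantiations through a host-side function pointer keeps all flag branching out of the kernel. The sketch below mirrors that dispatch under illustrative names; the kernel body is a stand-in, not the real nbnxn_gpu_add_nbat_f_to_f_kernel:

    #include <cuda_runtime.h>

    template<bool accumulate, bool addPmeForce>
    __global__ void addForcesKernel(float3 *fTotal, const float3 *fNb,
                                    const float3 *fPme, int numAtoms)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < numAtoms)
        {
            float3 f = fNb[i];
            if (addPmeForce) // compile-time constant, folded away
            {
                f.x += fPme[i].x; f.y += fPme[i].y; f.z += fPme[i].z;
            }
            if (accumulate)  // add into, rather than overwrite, the total
            {
                f.x += fTotal[i].x; f.y += fTotal[i].y; f.z += fTotal[i].z;
            }
            fTotal[i] = f;
        }
    }

    using KernelFn = void (*)(float3 *, const float3 *, const float3 *, int);

    static KernelFn selectKernel(bool accumulateForce, bool useGpuFPmeReduction)
    {
        if (useGpuFPmeReduction)
        {
            return accumulateForce ? addForcesKernel<true, true>
                                   : addForcesKernel<false, true>;
        }
        return accumulateForce ? addForcesKernel<true, false>
                               : addForcesKernel<false, false>;
    }

Because each flag combination is a separate instantiation, the booleans cost nothing inside the kernel; the only runtime branch is the host-side selection.
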
rvec *f)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
- GMX_ASSERT(f, "Need a valid f pointer");
const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
cudaStream_t stream = nb->stream[iLocality];
bool bDoTime = nb->bDoTime;
cu_timers_t *t = nb->timers;
- int atomStart = 0, nAtoms = 0;
- nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+ int atomStart = 0, numCopyAtoms = 0;
+
+ nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &numCopyAtoms);
+ // Avoid launching a copy with no work
+ if (numCopyAtoms == 0)
+ {
+ return;
+ }
+ GMX_ASSERT(f, "Need a valid f pointer");
if (bDoTime)
{
rvec *ptrDest = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
rvec *ptrSrc = reinterpret_cast<rvec *> (f[atomStart]);
- //copyToDeviceBuffer(&ptrDest, ptrSrc, 0, nAtoms,
+ //copyToDeviceBuffer(&ptrDest, ptrSrc, 0, numCopyAtoms,
// stream, GpuApiCallBehavior::Async, nullptr);
//TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
- cudaMemcpyAsync(ptrDest, ptrSrc, nAtoms*sizeof(rvec), cudaMemcpyHostToDevice,
+ cudaMemcpyAsync(ptrDest, ptrSrc, numCopyAtoms*sizeof(rvec), cudaMemcpyHostToDevice,
stream);
if (bDoTime)
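
The TODO above exists because cudaMemcpyAsync is only truly asynchronous when the host buffer is page-locked; once the force buffer lives in a pinned host vector, the copyToDeviceBuffer wrapper can take over. An illustrative sketch of the distinction, where cudaMallocHost is the real CUDA call and the helper names are assumptions:

    #include <cuda_runtime.h>

    using rvec = float[3];

    // Page-locked host allocation: transfers from this buffer can overlap
    // with kernels, whereas a pageable buffer forces a hidden staging copy.
    static rvec *allocatePinnedForces(int numAtoms)
    {
        rvec *f = nullptr;
        cudaMallocHost(reinterpret_cast<void **>(&f), numAtoms * sizeof(rvec));
        return f;
    }

    static void copyForcesToDevice(rvec *deviceF, const rvec *hostF,
                                   int atomStart, int numCopyAtoms,
                                   cudaStream_t stream)
    {
        cudaMemcpyAsync(&deviceF[atomStart], &hostF[atomStart],
                        numCopyAtoms * sizeof(rvec),
                        cudaMemcpyHostToDevice, stream);
    }
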
rvec *f)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
- GMX_ASSERT(f, "Need a valid f pointer");
const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
cudaStream_t stream = nb->stream[iLocality];
bool bDoTime = nb->bDoTime;
cu_timers_t *t = nb->timers;
- int atomStart, nAtoms;
- nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+ int atomStart, numCopyAtoms;
+
+ nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &numCopyAtoms);
+ // Avoid launching a copy with no work
+ if (numCopyAtoms == 0)
+ {
+ return;
+ }
+ GMX_ASSERT(f, "Need a valid f pointer");
if (bDoTime)
{
GMX_ASSERT(nb->frvec, "Need a valid nb->frvec pointer");
rvec *ptrDest = reinterpret_cast<rvec *> (f[atomStart]);
rvec *ptrSrc = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
- //copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
+ //copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, numCopyAtoms,
// stream, GpuApiCallBehavior::Async, nullptr);
//TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
- cudaMemcpyAsync(ptrDest, ptrSrc, nAtoms*sizeof(rvec), cudaMemcpyDeviceToHost,
+ cudaMemcpyAsync(ptrDest, ptrSrc, numCopyAtoms*sizeof(rvec), cudaMemcpyDeviceToHost,
stream);
if (bDoTime)
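
The device-to-host direction is symmetric, with one extra obligation: the host must not read f until the asynchronous copy has completed. A sketch of one way to express that with a CUDA event; the names are illustrative, and cudaEventRecord/cudaEventSynchronize are the real CUDA calls:

    #include <cuda_runtime.h>

    using rvec = float[3];

    static void copyForcesFromDevice(rvec *hostF, const rvec *deviceF,
                                     int atomStart, int numCopyAtoms,
                                     cudaStream_t stream, cudaEvent_t copyDone)
    {
        // Mirror the empty-domain early return above.
        if (numCopyAtoms == 0)
        {
            return;
        }
        cudaMemcpyAsync(&hostF[atomStart], &deviceF[atomStart],
                        numCopyAtoms * sizeof(rvec),
                        cudaMemcpyDeviceToHost, stream);
        // Record completion so a consumer can wait without stalling the stream.
        cudaEventRecord(copyDone, stream);
    }

    // Consumer side: block only when the forces are actually needed on the host.
    static void waitForForces(cudaEvent_t copyDone)
    {
        cudaEventSynchronize(copyDone);
    }
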