On search steps we do not use x buffer ops, so the workload flag should
correctly reflect that.
Also slightly refactored a conditional block to clarify the scope of
workload flags.
Note that, as a side-effect of this change, the coordinate H2D copy is
delayed from the beginning of do_force() to just before the update on
search steps when no force task requires it earlier (i.e. without PME).
While this is not ideal for performance, the code is easier to reason
about.
Refs #3915 #3913 #4268
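
For reference, here is the resulting per-step flag logic and its
consequence for the early H2D copy, as a minimal sketch: the flag names
match the diff below, while the surrounding context (including the
copyCoordinatesEarly name) is illustrative rather than the exact
do_force() code.

    // X buffer ops are not used on search steps, and the flag now reflects that.
    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps && !flags.doNeighborSearch;
    // On virial steps the CPU reduction path is taken.
    flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;

    // Consequence: on a search step without a GPU PME task on this rank,
    // neither condition below holds, so the local coordinate H2D copy is
    // no longer launched at the beginning of do_force() and instead
    // happens just before the update.
    const bool copyCoordinatesEarly = // hypothetical local, for illustration only
            stepWork.haveGpuPmeOnThisRank || stepWork.useGpuXBufferOps;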
GMX_ASSERT(simulationWork.useGpuNonbonded,
"Can only offload buffer ops if nonbonded computation is also offloaded");
}
- flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
+ flags.useGpuXBufferOps = simulationWork.useGpuBufferOps && !flags.doNeighborSearch;
// on virial steps the CPU reduction path is taken
flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
const bool rankHasGpuPmeTask = simulationWork.useGpuPme && !simulationWork.haveSeparatePmeRank;
haveCopiedXFromGpu = true;
}
+ if (stepWork.doNeighborSearch && (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuBufferOps))
+ {
+ // TODO refactor this to do_md, after partitioning.
+ stateGpu->reinit(mdatoms->homenr,
+ getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
+ if (stepWork.haveGpuPmeOnThisRank)
+ {
+ // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
+ pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
+ }
+ }
+
// Coordinates on the device are needed if PME or BufferOps are offloaded.
// The local coordinates can be copied right away.
// NOTE: Consider moving this copy to right after they are updated and constrained,
// if the latter is not offloaded.
if (stepWork.haveGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
{
- if (stepWork.doNeighborSearch)
- {
- // TODO refactor this to do_md, after partitioning.
- stateGpu->reinit(mdatoms->homenr,
- getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
- if (stepWork.haveGpuPmeOnThisRank)
- {
- // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
- pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
- }
- }
// We need to copy coordinates when:
// 1. Update is not offloaded
// 2. The buffers were reinitialized on search step
wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal);
wallcycle_stop(wcycle, WallCycleCounter::NS);
- if (stepWork.useGpuXBufferOps)
+ if (simulationWork.useGpuBufferOps)
{
nbv->atomdata_init_copy_x_to_nbat_x_gpu();
}
wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
- if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
+ if (!stepWork.useGpuXBufferOps)
{
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
}
if (simulationWork.useGpuNonbonded)
{
- if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
+ if (!stepWork.useGpuXBufferOps)
{
wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);