}
}
-void gpu_launch_cpyback(NbnxmGpu* nb,
- nbnxn_atomdata_t* nbatom,
- const gmx::StepWorkload& stepWork,
- const AtomLocality atomLocality)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- /* determine interaction locality from atom locality */
- const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
- GMX_ASSERT(iloc == InteractionLocality::Local
- || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
- "Non-local stream is indicating that the copy back event is enqueued at the "
- "beginning of the copy back function.");
-
- /* extract the data */
- NBAtomData* adat = nb->atdat;
- Nbnxm::GpuTimers* timers = nb->timers;
- bool bDoTime = nb->bDoTime;
- const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
- /* don't launch non-local copy-back if there was no non-local work to do */
- if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
- {
- nb->bNonLocalStreamDoneMarked = false;
- return;
- }
-
- /* local/nonlocal offset and length used for xq and f */
- auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
- /* beginning of timed D2H section */
- if (bDoTime)
- {
- timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
- }
-
- /* With DD the local D2H transfer can only start after the non-local
- kernel has finished. */
- if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
- {
- nb->nonlocal_done.enqueueWaitEvent(deviceStream);
- nb->bNonLocalStreamDoneMarked = false;
- }
-
- /* DtoH f
- * Skip if buffer ops / reduction is offloaded to the GPU.
- */
- if (!stepWork.useGpuFBufferOps)
- {
- static_assert(
- sizeof(adat->f[0]) == sizeof(Float3),
- "The size of the force buffer element should be equal to the size of float3.");
- copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
- &adat->f,
- atomsRange.begin(),
- atomsRange.size(),
- deviceStream,
- GpuApiCallBehavior::Async,
- nullptr);
- }
-
- /* After the non-local D2H is launched the nonlocal_done event can be
- recorded which signals that the local D2H can proceed. This event is not
- placed after the non-local kernel because we want the non-local data
- back first. */
- if (iloc == InteractionLocality::NonLocal)
- {
- nb->nonlocal_done.markEvent(deviceStream);
- nb->bNonLocalStreamDoneMarked = true;
- }
-
- /* only transfer energies in the local stream */
- if (iloc == InteractionLocality::Local)
- {
- /* DtoH fshift when virial is needed */
- if (stepWork.computeVirial)
- {
- static_assert(sizeof(nb->nbst.fShift[0]) == sizeof(adat->fShift[0]),
- "Sizes of host- and device-side shift vectors should be the same.");
- copyFromDeviceBuffer(
- nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
- }
-
- /* DtoH energies */
- if (stepWork.computeEnergy)
- {
- static_assert(sizeof(nb->nbst.eLJ[0]) == sizeof(adat->eLJ[0]),
- "Sizes of host- and device-side LJ energy terms should be the same.");
- copyFromDeviceBuffer(
- nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
- static_assert(sizeof(nb->nbst.eElec[0]) == sizeof(adat->eElec[0]),
- "Sizes of host- and device-side electrostatic energy terms should be the "
- "same.");
- copyFromDeviceBuffer(
- nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
- }
- }
-
- if (bDoTime)
- {
- timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
- }
-}
-
void cuda_set_cacheconfig()
{
cudaError_t stat;
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
}
+/*! \brief
+ * Launch asynchronously the download of nonbonded forces from the GPU
+ * (and energies/shift forces if required).
+ *
+ * The forces for the atom range of \p atomLocality are copied into
+ * \c nbatom->out[0].f unless the force buffer operations are offloaded
+ * (\c stepWork.useGpuFBufferOps), in which case the force download is skipped.
+ * Shift forces and energies are only downloaded in the local call, and only
+ * when \c stepWork.computeVirial / \c stepWork.computeEnergy request them.
+ *
+ * A non-local call with no non-local short-range work returns early, after
+ * clearing \c nb->bNonLocalStreamDoneMarked, without enqueueing anything.
+ *
+ * \param[in,out] nb            The nonbonded GPU data; must not be null.
+ * \param[out]    nbatom        Host atom data whose first output force buffer receives the forces.
+ * \param[in]     stepWork      Workload flags of the current step.
+ * \param[in]     atomLocality  Atom locality (local or non-local) whose results are downloaded.
+ */
+void gpu_launch_cpyback(NbnxmGpu*                nb,
+                        struct nbnxn_atomdata_t* nbatom,
+                        const gmx::StepWorkload& stepWork,
+                        const AtomLocality       atomLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    /* determine interaction locality from atom locality */
+    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+    GMX_ASSERT(iloc == InteractionLocality::Local
+                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
+               "Non-local stream is indicating that the copy back event is enqueued at the "
+               "beginning of the copy back function.");
+
+    /* extract the data */
+    NBAtomData*         adat         = nb->atdat;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
+    bool                bDoTime      = nb->bDoTime;
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+
+    /* don't launch non-local copy-back if there was no non-local work to do */
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+    {
+        /* TODO (OpenCL) An alternative way to signal that non-local work is
+           complete is to use a clEnqueueMarker+clEnqueueBarrier
+           pair. However, the use of bNonLocalStreamDoneMarked has the
+           advantage of being local to the host, so probably minimizes
+           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
+           test case, overall simulation performance was higher with
+           the API calls, but this has not been tested on AMD OpenCL,
+           so could be worth considering in future. */
+        nb->bNonLocalStreamDoneMarked = false;
+        return;
+    }
+
+    /* local/nonlocal offset and length used for xq and f */
+    auto atomsRange = getGpuAtomRange(adat, atomLocality);
+
+    /* beginning of timed D2H section */
+    if (bDoTime)
+    {
+        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
+    }
+
+    /* With DD the local D2H transfer must be ordered after the non-local one:
+       when the non-local call has marked nonlocal_done, make the local stream
+       wait on that event before any local copy is enqueued. */
+    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
+    {
+        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = false;
+    }
+
+    /* DtoH f
+     * Skip if buffer ops / reduction is offloaded to the GPU.
+     */
+    if (!stepWork.useGpuFBufferOps)
+    {
+        static_assert(
+                sizeof(*nbatom->out[0].f.data()) == sizeof(float),
+                "The host force buffer should be in single precision to match device data size.");
+        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
+                             &adat->f,
+                             atomsRange.begin(),
+                             atomsRange.size(),
+                             deviceStream,
+                             GpuApiCallBehavior::Async,
+                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+    }
+
+    /* kick off work */
+    issueClFlushInStream(deviceStream);
+
+    /* After the non-local D2H is launched the nonlocal_done event can be
+       recorded which signals that the local D2H can proceed. This event is not
+       placed after the non-local kernel because we want the non-local data
+       back first. */
+    if (iloc == InteractionLocality::NonLocal)
+    {
+        nb->nonlocal_done.markEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = true;
+    }
+
+    /* only transfer energies in the local stream */
+    if (iloc == InteractionLocality::Local)
+    {
+        /* DtoH fshift when virial is needed */
+        if (stepWork.computeVirial)
+        {
+            static_assert(
+                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
+                    "Sizes of host- and device-side shift vector elements should be the same.");
+            copyFromDeviceBuffer(nb->nbst.fShift,
+                                 &adat->fShift,
+                                 0,
+                                 SHIFTS,
+                                 deviceStream,
+                                 GpuApiCallBehavior::Async,
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+        }
+
+        /* DtoH energies */
+        if (stepWork.computeEnergy)
+        {
+            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
+                          "Sizes of host- and device-side LJ energy terms should be the same.");
+            copyFromDeviceBuffer(nb->nbst.eLJ,
+                                 &adat->eLJ,
+                                 0,
+                                 1,
+                                 deviceStream,
+                                 GpuApiCallBehavior::Async,
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
+                          "Sizes of host- and device-side electrostatic energy terms should be the "
+                          "same.");
+            copyFromDeviceBuffer(nb->nbst.eElec,
+                                 &adat->eElec,
+                                 0,
+                                 1,
+                                 deviceStream,
+                                 GpuApiCallBehavior::Async,
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+        }
+    }
+
+    /* end of timed D2H section */
+    if (bDoTime)
+    {
+        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
+    }
+}
+
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/nbnxm_gpu.h"
#include "gromacs/nbnxm/pairlist.h"
-#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
}
}
-/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
- * (and energies/shift forces if required).
- */
-void gpu_launch_cpyback(NbnxmGpu* nb,
- struct nbnxn_atomdata_t* nbatom,
- const gmx::StepWorkload& stepWork,
- const AtomLocality atomLocality)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- cl_int gmx_unused cl_error;
-
- /* determine interaction locality from atom locality */
- const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
- GMX_ASSERT(iloc == InteractionLocality::Local
- || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
- "Non-local stream is indicating that the copy back event is enqueued at the "
- "beginning of the copy back function.");
-
- NBAtomData* adat = nb->atdat;
- Nbnxm::GpuTimers* timers = nb->timers;
- bool bDoTime = nb->bDoTime;
- const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
- /* don't launch non-local copy-back if there was no non-local work to do */
- if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
- {
- /* TODO An alternative way to signal that non-local work is
- complete is to use a clEnqueueMarker+clEnqueueBarrier
- pair. However, the use of bNonLocalStreamDoneMarked has the
- advantage of being local to the host, so probably minimizes
- overhead. Curiously, for NVIDIA OpenCL with an empty-domain
- test case, overall simulation performance was higher with
- the API calls, but this has not been tested on AMD OpenCL,
- so could be worth considering in future. */
- nb->bNonLocalStreamDoneMarked = false;
- return;
- }
-
- /* local/nonlocal offset and length used for xq and f */
- auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
- /* beginning of timed D2H section */
- if (bDoTime)
- {
- timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
- }
-
- /* With DD the local D2H transfer can only start after the non-local
- has been launched. */
- if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
- {
- nb->nonlocal_done.enqueueWaitEvent(deviceStream);
- nb->bNonLocalStreamDoneMarked = false;
- }
-
- /* DtoH f */
- GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
- "The host force buffer should be in single precision to match device data size.");
- copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
- &adat->f,
- atomsRange.begin(),
- atomsRange.size(),
- deviceStream,
- GpuApiCallBehavior::Async,
- bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
-
- /* kick off work */
- cl_error = clFlush(deviceStream.stream());
- GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-
- /* After the non-local D2H is launched the nonlocal_done event can be
- recorded which signals that the local D2H can proceed. This event is not
- placed after the non-local kernel because we first need the non-local
- data back first. */
- if (iloc == InteractionLocality::NonLocal)
- {
- nb->nonlocal_done.markEvent(deviceStream);
- nb->bNonLocalStreamDoneMarked = true;
- }
-
- /* only transfer energies in the local stream */
- if (iloc == InteractionLocality::Local)
- {
- /* DtoH fshift when virial is needed */
- if (stepWork.computeVirial)
- {
- static_assert(
- sizeof(*nb->nbst.fShift) == sizeof(Float3),
- "Sizes of host- and device-side shift vector elements should be the same.");
- copyFromDeviceBuffer(nb->nbst.fShift,
- &adat->fShift,
- 0,
- SHIFTS,
- deviceStream,
- GpuApiCallBehavior::Async,
- bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
- }
-
- /* DtoH energies */
- if (stepWork.computeEnergy)
- {
- static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
- "Sizes of host- and device-side LJ energy terms should be the same.");
- copyFromDeviceBuffer(nb->nbst.eLJ,
- &adat->eLJ,
- 0,
- 1,
- deviceStream,
- GpuApiCallBehavior::Async,
- bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
- static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
- "Sizes of host- and device-side electrostatic energy terms should be the "
- "same.");
- copyFromDeviceBuffer(nb->nbst.eElec,
- &adat->eElec,
- 0,
- 1,
- deviceStream,
- GpuApiCallBehavior::Async,
- bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
- }
- }
-
- if (bDoTime)
- {
- timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
- }
-}
-
} // namespace Nbnxm
namespace Nbnxm
{
-/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
- * (and energies/shift forces if required).
- */
-void gpu_launch_cpyback(NbnxmGpu* nb,
- struct nbnxn_atomdata_t* nbatom,
- const gmx::StepWorkload& stepWork,
- const AtomLocality atomLocality)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
- GMX_ASSERT(iloc == InteractionLocality::Local
- || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
- "Non-local stream is indicating that the copy back event is enqueued at the "
- "beginning of the copy back function.");
-
- const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
- NBAtomData* adat = nb->atdat;
-
- /* don't launch non-local copy-back if there was no non-local work to do */
- if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
- {
- nb->bNonLocalStreamDoneMarked = false;
- return;
- }
-
- /* local/nonlocal offset and length used for xq and f */
- auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
- // With DD the local D2H transfer can only start after the non-local kernel has finished.
- if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
- {
- nb->nonlocal_done.waitForEvent();
- nb->bNonLocalStreamDoneMarked = false;
- }
-
- /* DtoH f
- * Skip if buffer ops / reduction is offloaded to the GPU.
- */
- if (!stepWork.useGpuFBufferOps)
- {
- GMX_ASSERT(adat->f.elementSize() == sizeof(Float3),
- "The size of the force buffer element should be equal to the size of float3.");
- copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
- &adat->f,
- atomsRange.begin(),
- atomsRange.size(),
- deviceStream,
- GpuApiCallBehavior::Async,
- nullptr);
- }
-
- /* After the non-local D2H is launched the nonlocal_done event can be
- recorded which signals that the local D2H can proceed. This event is not
- placed after the non-local kernel because we want the non-local data
- back first. */
- if (iloc == InteractionLocality::NonLocal)
- {
- nb->nonlocal_done.markEvent(deviceStream);
- nb->bNonLocalStreamDoneMarked = true;
- }
-
- /* only transfer energies in the local stream */
- if (iloc == InteractionLocality::Local)
- {
- /* DtoH fshift when virial is needed */
- if (stepWork.computeVirial)
- {
- GMX_ASSERT(sizeof(*nb->nbst.fShift) == adat->fShift.elementSize(),
- "Sizes of host- and device-side shift vector elements should be the same.");
- copyFromDeviceBuffer(
- nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
- }
-
- /* DtoH energies */
- if (stepWork.computeEnergy)
- {
- GMX_ASSERT(sizeof(*nb->nbst.eLJ) == sizeof(float),
- "Sizes of host- and device-side LJ energy terms should be the same.");
- copyFromDeviceBuffer(
- nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
- GMX_ASSERT(sizeof(*nb->nbst.eElec) == sizeof(float),
- "Sizes of host- and device-side electrostatic energy terms should be the "
- "same.");
- copyFromDeviceBuffer(
- nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
- }
- }
-}
-
void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
gpu_plist* plist = nb->plist[iloc];