#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/nbnxm_gpu.h"
#include "gromacs/nbnxm/pairlist.h"
-#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
nbparams_params->vdw_switch = nbp->vdw_switch;
}
-/*! \brief Enqueues, in \p stream, a barrier that waits for \p ocl_event to complete.
- *
- * After the barrier has been enqueued, the event is released and the handle
- * pointed to by \p ocl_event is reset to nullptr.
- * Don't use this function when more than one wait will be issued for the event.
- * Described by the original authors as equivalent to a CUDA stream sync.
- * NOTE(review): the visible code only enqueues a barrier command; it makes no
- * blocking call such as clFinish or clWaitForEvents - confirm the intended analogy.
- *
- * \param[in]     stream     Command queue in which the barrier is enqueued.
- * \param[in,out] ocl_event  Event to wait for; released and set to nullptr on return.
- */
-static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event)
-{
- cl_int gmx_unused cl_error;
-
- /* Enqueue a barrier in the stream that waits on the single event passed in */
- cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
- GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
-
- /* Release the event and reset the handle to nullptr. Releasing right after the
-  * enqueue is assumed to be safe because the enqueued wait retains the event
-  * implicitly (claim carried over from the original comment, which referred to
-  * "enqueuewaitforevents"; confirm against the OpenCL specification). */
- cl_error = clReleaseEvent(*ocl_event);
- GMX_ASSERT(cl_error == CL_SUCCESS,
- ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str());
- *ocl_event = nullptr;
-}
-
-/*! \brief Launch asynchronously the xq buffer host to device copy.
- *
- * \param[in,out] nb            Nonbonded GPU data; must not be null (asserted below).
- * \param[in]     nbatom        Host-side atom data whose x() buffer is the copy source.
- * \param[in]     atomLocality  Selects whether the local or the non-local atom range is copied.
- */
-void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-
- /* local/nonlocal offset and length (in atoms) used for xq and f */
- int adat_begin, adat_len;
-
- cl_atomdata_t* adat = nb->atdat;
- gpu_plist* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
- const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
- bool bDoTime = nb->bDoTime;
-
- /* Don't launch the non-local H2D copy if there is no dependent
- work to do: neither non-local nor other (e.g. bonded) work
- to do that has as input the nbnxn coordinates.
- Doing the same for the local kernel is more complicated, since the
- local part of the force array also depends on the non-local kernel.
- So to avoid complicating the code and to reduce the risk of bugs,
- we always call the local x+q copy (and the rest of the local
- work in nbnxn_gpu_launch_kernel()).
- On this early exit the pair list's haveFreshList flag is also cleared.
- */
- if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
- {
- plist->haveFreshList = false;
-
- return;
- }
-
- /* calculate the atom data index range based on locality:
-  * local atoms come first, non-local atoms follow them */
- if (atomLocality == AtomLocality::Local)
- {
- adat_begin = 0;
- adat_len = adat->natoms_local;
- }
- else
- {
- adat_begin = adat->natoms_local;
- adat_len = adat->natoms - adat->natoms_local;
- }
-
- /* beginning of timed HtoD section */
- if (bDoTime)
- {
- t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
- }
-
- /* HtoD x, q.
-  * Offset and length are passed in float elements, 4 per atom.
-  * NOTE(review): the assert compares the host element size with sizeof(float)
-  * although its message mentions float4 - confirm which wording is intended. */
- GMX_ASSERT(sizeof(float) == sizeof(*nbatom->x().data()),
- "The size of the xyzq buffer element should be equal to the size of float4.");
- copyToDeviceBuffer(&adat->xq,
- nbatom->x().data() + adat_begin * 4,
- adat_begin * 4,
- adat_len * 4,
- deviceStream,
- GpuApiCallBehavior::Async,
- bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
-
- if (bDoTime)
- {
- t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
- }
-
- /* When we get here all misc operations issued in the local stream as well as
- the local xq H2D are done,
- so we record that in the local stream and wait for it in the nonlocal one.
- The wait (sync_ocl_event) also releases the event and resets its handle. */
- if (nb->bUseTwoStreams)
- {
- if (iloc == InteractionLocality::Local)
- {
- cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
- deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
- GMX_ASSERT(cl_error == CL_SUCCESS,
- ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
-
- /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
- * in the local stream in order to be able to sync with the above event
- * from the non-local stream.
- */
- cl_error = clFlush(deviceStream.stream());
- GMX_ASSERT(cl_error == CL_SUCCESS,
- ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
- }
- else
- {
- sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done));
- }
- }
-}
-
-
/*! \brief Launch GPU kernel
As we execute nonbonded workload in separate queues, before launching
*/
void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
{
- cl_atomdata_t* adat = nb->atdat;
+ NBAtomDataGpu* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
- t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
+ timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
}
/* kernel launch config */
fillin_ocl_structures(nbp, &nbparams_params);
- auto* timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+ auto* timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
constexpr char kernelName[] = "k_calc_nb";
const auto kernel =
select_nbnxn_kernel(nb,
&nbparams_params,
&adat->xq,
&adat->f,
- &adat->e_lj,
- &adat->e_el,
- &adat->fshift,
- &adat->lj_comb,
- &adat->shift_vec,
+ &adat->eLJ,
+ &adat->eElec,
+ &adat->fShift,
+ &adat->ljComb,
+ &adat->shiftVec,
&nbp->nbfp,
&nbp->nbfp_comb,
&nbp->coulomb_tab,
{
const auto kernelArgs = prepareGpuKernelArguments(kernel,
config,
- &adat->ntypes,
+ &adat->numTypes,
&nbparams_params,
&adat->xq,
&adat->f,
- &adat->e_lj,
- &adat->e_el,
- &adat->fshift,
- &adat->atom_types,
- &adat->shift_vec,
+ &adat->eLJ,
+ &adat->eElec,
+ &adat->fShift,
+ &adat->atomTypes,
+ &adat->shiftVec,
&nbp->nbfp,
&nbp->nbfp_comb,
&nbp->coulomb_tab,
if (bDoTime)
{
- t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
+ timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
}
}
* for OpenCL local memory.
*
* \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd
- * dimension. \returns the amount of local memory in bytes required by the pruning kernel
+ * dimension.
+ * \returns the amount of local memory in bytes required by the pruning kernel
*/
static inline int calc_shmem_required_prune(const int num_threads_z)
{
*/
void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
- cl_atomdata_t* adat = nb->atdat;
+ NBAtomDataGpu* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
GpuRegionTimer* timer = nullptr;
if (bDoTime)
{
- timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+ timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
+ : timers->interaction[iloc].rollingPrune_k);
}
/* beginning of timed prune calculation section */
* and j-cluster concurrency, in x, y, and z, respectively.
* - The 1D block-grid contains as many blocks as super-clusters.
*/
- int num_threads_z = c_oclPruneKernelJ4ConcurrencyDEFAULT;
-
-
+ int num_threads_z = c_pruneKernelJ4Concurrency;
/* kernel launch config */
KernelLaunchConfig config;
config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
config,
&nbparams_params,
&adat->xq,
- &adat->shift_vec,
+ &adat->shiftVec,
&plist->sci,
&plist->cj4,
&plist->imask,
}
}
-/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
- * (and energies/shift forces if required).
- *
- * \param[in,out] nb        Nonbonded GPU data; must not be null (asserted below).
- * \param[in,out] nbatom    Host atom data; forces are copied into nbatom->out[0].f.
- * \param[in]     stepWork  Its computeVirial and computeEnergy flags select the
- *                          shift-force and energy transfers, respectively.
- * \param[in]     aloc      Atom locality; determines the stream and the atom range.
- */
-void gpu_launch_cpyback(NbnxmGpu* nb,
- struct nbnxn_atomdata_t* nbatom,
- const gmx::StepWorkload& stepWork,
- const AtomLocality aloc)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
- cl_int gmx_unused cl_error;
- int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
-
- /* determine interaction locality from atom locality */
- const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
-
- cl_atomdata_t* adat = nb->atdat;
- cl_timers_t* t = nb->timers;
- bool bDoTime = nb->bDoTime;
- const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
- /* don't launch non-local copy-back if there was no non-local work to do */
- if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
- {
- /* TODO An alternative way to signal that non-local work is
- complete is to use a clEnqueueMarker+clEnqueueBarrier
- pair. However, the use of bNonLocalStreamActive has the
- advantage of being local to the host, so probably minimizes
- overhead. Curiously, for NVIDIA OpenCL with an empty-domain
- test case, overall simulation performance was higher with
- the API calls, but this has not been tested on AMD OpenCL,
- so could be worth considering in future. */
- nb->bNonLocalStreamActive = CL_FALSE;
- return;
- }
-
- getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
-
- /* beginning of timed D2H section */
- if (bDoTime)
- {
- t->xf[aloc].nb_d2h.openTimingRegion(deviceStream);
- }
-
- /* With DD the local D2H transfer can only start after the non-local
- has been launched. The wait also releases the nonlocal_done event. */
- if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
- {
- sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done));
- }
-
- /* DtoH f; offset and length are passed in float elements, DIM per atom */
- GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
- "The host force buffer should be in single precision to match device data size.");
- copyFromDeviceBuffer(&nbatom->out[0].f[adat_begin * DIM],
- &adat->f,
- adat_begin * DIM,
- adat_len * DIM,
- deviceStream,
- GpuApiCallBehavior::Async,
- bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-
- /* kick off work */
- cl_error = clFlush(deviceStream.stream());
- GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-
- /* After the non-local D2H is launched the nonlocal_done event can be
- recorded which signals that the local D2H can proceed. This event is not
- placed after the non-local kernel because we need the non-local
- data back first. */
- if (iloc == InteractionLocality::NonLocal)
- {
- cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done));
- GMX_ASSERT(cl_error == CL_SUCCESS,
- ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
- nb->bNonLocalStreamActive = CL_TRUE;
- }
-
- /* only transfer shift forces and energies in the local stream */
- if (iloc == InteractionLocality::Local)
- {
- /* DtoH fshift when virial is needed */
- if (stepWork.computeVirial)
- {
- GMX_ASSERT(sizeof(*nb->nbst.fshift) == DIM * sizeof(float),
- "Sizes of host- and device-side shift vector elements should be the same.");
- copyFromDeviceBuffer(reinterpret_cast<float*>(nb->nbst.fshift),
- &adat->fshift,
- 0,
- SHIFTS * DIM,
- deviceStream,
- GpuApiCallBehavior::Async,
- bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
- }
-
- /* DtoH energies: one LJ and one electrostatic value */
- if (stepWork.computeEnergy)
- {
- GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
- "Sizes of host- and device-side LJ energy terms should be the same.");
- copyFromDeviceBuffer(nb->nbst.e_lj,
- &adat->e_lj,
- 0,
- 1,
- deviceStream,
- GpuApiCallBehavior::Async,
- bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
- GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
- "Sizes of host- and device-side electrostatic energy terms should be the "
- "same.");
- copyFromDeviceBuffer(nb->nbst.e_el,
- &adat->e_el,
- 0,
- 1,
- deviceStream,
- GpuApiCallBehavior::Async,
- bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
- }
- }
-
- if (bDoTime)
- {
- t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream);
- }
-}
-
} // namespace Nbnxm