constexpr static int c_bufOpsThreadsPerBlock = 128;
/*! Nonbonded kernel function pointer type */
-typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, const NBParamGpu, const gpu_plist, bool);
+typedef void (*nbnxn_cu_kfunc_ptr_t)(const NBAtomData, const NBParamGpu, const gpu_plist, bool);
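/* A sketch of intended use (table and index names are hypothetical): host
 * code selects a kernel flavor and launches it through the pointer; note
 * that the argument structs are passed by value:
 *   nbnxn_cu_kfunc_ptr_t kernel = kernelTable[elecTypeIdx][vdwTypeIdx];
 *   kernel<<<gridSize, blockSize, shmemBytes, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 */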
/*********************************/
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
- cu_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
gpu_plist* plist = nb->plist[iloc];
cu_timers_t* t = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
if (atomLocality == AtomLocality::Local)
{
adat_begin = 0;
- adat_len = adat->natoms_local;
+ adat_len = adat->numAtomsLocal;
}
else
{
- adat_begin = adat->natoms_local;
- adat_len = adat->natoms - adat->natoms_local;
+ adat_begin = adat->numAtomsLocal;
+ adat_len = adat->numAtoms - adat->numAtomsLocal;
}
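/* Worked example: with adat->numAtomsLocal == 1000 and adat->numAtoms == 1500,
 * the Local range is [0, 1000) and the NonLocal range starts at 1000 with
 * length 500; nonlocal atoms are stored contiguously after the local ones. */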
/* beginning of timed HtoD section */
*/
void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc)
{
- cu_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
cu_timers_t* t = nb->timers;
void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
- cu_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
cu_timers_t* t = nb->timers;
"beginning of the copy back function.");
/* extract the data */
- cu_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
cu_timers_t* t = nb->timers;
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
/* DtoH fshift when virial is needed */
if (stepWork.computeVirial)
{
- static_assert(sizeof(nb->nbst.fshift[0]) == sizeof(adat->fshift[0]),
+ static_assert(sizeof(nb->nbst.fShift[0]) == sizeof(adat->fShift[0]),
"Sizes of host- and device-side shift vectors should be the same.");
copyFromDeviceBuffer(
- nb->nbst.fshift, &adat->fshift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
+ nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
}
/* DtoH energies */
if (stepWork.computeEnergy)
{
- static_assert(sizeof(nb->nbst.e_lj[0]) == sizeof(adat->e_lj[0]),
+ static_assert(sizeof(nb->nbst.eLJ[0]) == sizeof(adat->eLJ[0]),
"Sizes of host- and device-side LJ energy terms should be the same.");
copyFromDeviceBuffer(
- nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
- static_assert(sizeof(nb->nbst.e_el[0]) == sizeof(adat->e_el[0]),
+ nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+ static_assert(sizeof(nb->nbst.eElec[0]) == sizeof(adat->eElec[0]),
"Sizes of host- and device-side electrostatic energy terms should be the "
"same.");
copyFromDeviceBuffer(
- nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+ nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
}
}
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
- cu_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
const int numColumns = grid.numColumns();
const int cellOffset = grid.cellOffset();
/*! Initializes the atomdata structure the first time; it only gets filled at
   pair-search. */
-static void init_atomdata_first(cu_atomdata_t* ad, int ntypes, const DeviceContext& deviceContext)
+static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext)
{
- ad->ntypes = ntypes;
- allocateDeviceBuffer(&ad->shift_vec, SHIFTS, deviceContext);
- ad->bShiftVecUploaded = false;
+ ad->numTypes = ntypes;
+ allocateDeviceBuffer(&ad->shiftVec, SHIFTS, deviceContext);
+ ad->shiftVecUploaded = false;
- allocateDeviceBuffer(&ad->fshift, SHIFTS, deviceContext);
- allocateDeviceBuffer(&ad->e_lj, 1, deviceContext);
- allocateDeviceBuffer(&ad->e_el, 1, deviceContext);
+ allocateDeviceBuffer(&ad->fShift, SHIFTS, deviceContext);
+ allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
+ allocateDeviceBuffer(&ad->eElec, 1, deviceContext);
/* initialize to nullptr pointers to data that is not allocated here and will
need reallocation in nbnxn_cuda_init_atomdata */
ad->f = nullptr;
/* size -1 indicates that the respective array hasn't been initialized yet */
- ad->natoms = -1;
- ad->nalloc = -1;
+ ad->numAtoms = -1;
+ ad->numAtomsAlloc = -1;
}
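/* Minimal usage sketch (hypothetical call site; the real one lives in the
 * GPU-init path):
 *   init_atomdata_first(nb->atdat, ntypes, deviceContext);
 *   // numAtomsAlloc == -1 now tells gpu_init_atomdata() that the per-atom
 *   // buffers (xq, f, atomTypes/ljComb) still need their first allocation.
 */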
/*! Initializes the nonbonded parameter data structure. */
snew(nb->timings, 1);
/* init nbst */
- pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
- pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
- pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
+ pmalloc((void**)&nb->nbst.eLJ, sizeof(*nb->nbst.eLJ));
+ pmalloc((void**)&nb->nbst.eElec, sizeof(*nb->nbst.eElec));
+ pmalloc((void**)&nb->nbst.fShift, SHIFTS * sizeof(*nb->nbst.fShift));
init_plist(nb->plist[InteractionLocality::Local]);
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
- cu_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
/* only if we have a dynamic box */
- if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
+ if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
{
- static_assert(sizeof(adat->shift_vec[0]) == sizeof(nbatom->shift_vec[0]),
+ static_assert(sizeof(adat->shiftVec[0]) == sizeof(nbatom->shift_vec[0]),
"Sizes of host- and device-side shift vectors should be the same.");
- copyToDeviceBuffer(&adat->shift_vec,
+ copyToDeviceBuffer(&adat->shiftVec,
reinterpret_cast<const Float3*>(nbatom->shift_vec.data()),
0,
SHIFTS,
localStream,
GpuApiCallBehavior::Async,
nullptr);
- adat->bShiftVecUploaded = true;
+ adat->shiftVecUploaded = true;
}
}
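/* Rationale: for a static box the shift vectors never change, so a single
 * upload suffices; with a dynamic box (e.g. under pressure coupling) they
 * are re-uploaded every step. */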
/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear)
{
- cu_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
clearDeviceBufferAsync(&adat->f, 0, natoms_clear, localStream);
}
/*! Clears nonbonded shift force output array and energy outputs on the GPU. */
static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb)
{
- cu_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
- clearDeviceBufferAsync(&adat->fshift, 0, SHIFTS, localStream);
- clearDeviceBufferAsync(&adat->e_lj, 0, 1, localStream);
- clearDeviceBufferAsync(&adat->e_el, 0, 1, localStream);
+ clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
+ clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
+ clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
}
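/* Note: all three clears are asynchronous but enqueued on the local stream,
 * so in-stream ordering guarantees they complete before any later kernel
 * submitted to the same stream accumulates into these buffers. */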
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
- nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
+ nbnxn_cuda_clear_f(nb, nb->atdat->numAtoms);
/* clear shift force array and energies if the outputs were
used in the current step */
if (computeVirial)
bool realloced;
bool bDoTime = nb->bDoTime;
cu_timers_t* timers = nb->timers;
- cu_atomdata_t* d_atdat = nb->atdat;
+ NBAtomData* d_atdat = nb->atdat;
const DeviceContext& deviceContext = *nb->deviceContext_;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
/* need to reallocate if we have to copy more atoms than the amount of space
-   available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
+   available and only allocate if we haven't initialized yet, i.e. d_atdat->numAtomsAlloc == -1 */
- if (natoms > d_atdat->nalloc)
+ if (natoms > d_atdat->numAtomsAlloc)
{
nalloc = over_alloc_small(natoms);
/* free up first if the arrays have already been initialized */
- if (d_atdat->nalloc != -1)
+ if (d_atdat->numAtomsAlloc != -1)
{
freeDeviceBuffer(&d_atdat->f);
freeDeviceBuffer(&d_atdat->xq);
- freeDeviceBuffer(&d_atdat->atom_types);
- freeDeviceBuffer(&d_atdat->lj_comb);
+ freeDeviceBuffer(&d_atdat->atomTypes);
+ freeDeviceBuffer(&d_atdat->ljComb);
}
allocateDeviceBuffer(&d_atdat->f, nalloc, deviceContext);
allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext);
if (useLjCombRule(nb->nbparam->vdwType))
{
- allocateDeviceBuffer(&d_atdat->lj_comb, nalloc, deviceContext);
+ allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext);
}
else
{
- allocateDeviceBuffer(&d_atdat->atom_types, nalloc, deviceContext);
+ allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext);
}
- d_atdat->nalloc = nalloc;
- realloced = true;
+ d_atdat->numAtomsAlloc = nalloc;
+ realloced = true;
}
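/* over_alloc_small() requests headroom above natoms so that modest growth in
 * the atom count between pair searches does not trigger a device
 * reallocation (and the accompanying f-buffer clear) every time. */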
- d_atdat->natoms = natoms;
- d_atdat->natoms_local = nbat->natoms_local;
+ d_atdat->numAtoms = natoms;
+ d_atdat->numAtomsLocal = nbat->natoms_local;
/* need to clear GPU f output if realloc happened */
if (realloced)
if (useLjCombRule(nb->nbparam->vdwType))
{
- static_assert(sizeof(d_atdat->lj_comb[0]) == sizeof(Float2),
+ static_assert(sizeof(d_atdat->ljComb[0]) == sizeof(Float2),
"Size of the LJ parameters element should be equal to the size of float2.");
- copyToDeviceBuffer(&d_atdat->lj_comb,
+ copyToDeviceBuffer(&d_atdat->ljComb,
reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
0,
natoms,
}
else
{
- static_assert(sizeof(d_atdat->atom_types[0]) == sizeof(nbat->params().type[0]),
+ static_assert(sizeof(d_atdat->atomTypes[0]) == sizeof(nbat->params().type[0]),
"Sizes of host- and device-side atom types should be the same.");
- copyToDeviceBuffer(&d_atdat->atom_types,
+ copyToDeviceBuffer(&d_atdat->atomTypes,
nbat->params().type.data(),
0,
natoms,
void gpu_free(NbnxmGpu* nb)
{
- cu_atomdata_t* atdat;
- NBParamGpu* nbparam;
-
if (nb == nullptr)
{
return;
}
- atdat = nb->atdat;
- nbparam = nb->nbparam;
+ NBAtomData* atdat = nb->atdat;
+ NBParamGpu* nbparam = nb->nbparam;
if ((!nbparam->coulomb_tab)
&& (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
}
- freeDeviceBuffer(&atdat->shift_vec);
- freeDeviceBuffer(&atdat->fshift);
+ freeDeviceBuffer(&atdat->shiftVec);
+ freeDeviceBuffer(&atdat->fShift);
- freeDeviceBuffer(&atdat->e_lj);
- freeDeviceBuffer(&atdat->e_el);
+ freeDeviceBuffer(&atdat->eLJ);
+ freeDeviceBuffer(&atdat->eElec);
freeDeviceBuffer(&atdat->f);
freeDeviceBuffer(&atdat->xq);
- freeDeviceBuffer(&atdat->atom_types);
- freeDeviceBuffer(&atdat->lj_comb);
+ freeDeviceBuffer(&atdat->atomTypes);
+ freeDeviceBuffer(&atdat->ljComb);
/* Free plist */
auto* plist = nb->plist[InteractionLocality::Local];
}
/* Free nbst */
- pfree(nb->nbst.e_lj);
- nb->nbst.e_lj = nullptr;
+ pfree(nb->nbst.eLJ);
+ nb->nbst.eLJ = nullptr;
- pfree(nb->nbst.e_el);
- nb->nbst.e_el = nullptr;
+ pfree(nb->nbst.eElec);
+ nb->nbst.eElec = nullptr;
- pfree(nb->nbst.fshift);
- nb->nbst.fshift = nullptr;
+ pfree(nb->nbst.fShift);
+ nb->nbst.fShift = nullptr;
sfree(atdat);
sfree(nbparam);
{
assert(nb);
- return reinterpret_cast<DeviceBuffer<gmx::RVec>>(nb->atdat->fshift);
+ return reinterpret_cast<DeviceBuffer<gmx::RVec>>(nb->atdat->fShift);
}
/* Initialization for X buffer operations on GPU. */
__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
# endif /* CALC_ENERGIES */
#endif /* PRUNE_NBL */
- (const cu_atomdata_t atdat, const NBParamGpu nbparam, const Nbnxm::gpu_plist plist, bool bCalcFshift)
+ (const NBAtomData atdat, const NBParamGpu nbparam, const Nbnxm::gpu_plist plist, bool bCalcFshift)
#ifdef FUNCTION_DECLARATION_ONLY
; /* Only do function declaration, omit the function body. */
#else
nbnxn_cj4_t* pl_cj4 = plist.cj4;
const nbnxn_excl_t* excl = plist.excl;
# ifndef LJ_COMB
- const int* atom_types = atdat.atom_types;
- int ntypes = atdat.ntypes;
+ const int* atom_types = atdat.atomTypes;
+ int ntypes = atdat.numTypes;
# else
- const float2* lj_comb = atdat.lj_comb;
+ const float2* lj_comb = atdat.ljComb;
float2 ljcp_i, ljcp_j;
# endif
const float4* xq = atdat.xq;
float3* f = asFloat3(atdat.f);
- const float3* shift_vec = asFloat3(atdat.shift_vec);
+ const float3* shift_vec = asFloat3(atdat.shiftVec);
float rcoulomb_sq = nbparam.rcoulomb_sq;
# ifdef VDW_CUTOFF_CHECK
float rvdw_sq = nbparam.rvdw_sq;
# else
float reactionFieldShift = nbparam.c_rf;
# endif /* EL_EWALD_ANY */
- float* e_lj = atdat.e_lj;
- float* e_el = atdat.e_el;
+ float* e_lj = atdat.eLJ;
+ float* e_el = atdat.eElec;
# endif /* CALC_ENERGIES */
/* thread/block/warp id-s */
/* add up local shift forces into global mem, tidxj indexes x,y,z */
if (bCalcFshift && (tidxj & 3) < 3)
{
- float3* fshift = asFloat3(atdat.fshift);
- atomicAdd(&(fshift[nb_sci.shift].x) + (tidxj & 3), fshift_buf);
+ float3* fShift = asFloat3(atdat.fShift);
+ atomicAdd(&(fShift[nb_sci.shift].x) + (tidxj & 3), fshift_buf);
}
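/* How the reduction above works: within each group of four consecutive
 * threads, the three with (tidxj & 3) == 0, 1, 2 add the x, y and z
 * components, respectively; taking &fShift[nb_sci.shift].x and offsetting by
 * (tidxj & 3) addresses the float3 as three consecutive floats. */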
# ifdef CALC_ENERGIES
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#ifndef FUNCTION_DECLARATION_ONLY
/* Instantiate external template functions */
template __global__ void
-nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
+nbnxn_kernel_prune_cuda<false>(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
template __global__ void
-nbnxn_kernel_prune_cuda<true>(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
+nbnxn_kernel_prune_cuda<true>(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
#endif
*/
template<bool haveFreshList>
__launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__
- void nbnxn_kernel_prune_cuda(const cu_atomdata_t atdat,
+ void nbnxn_kernel_prune_cuda(const NBAtomData atdat,
const NBParamGpu nbparam,
const Nbnxm::gpu_plist plist,
int numParts,
// Add extern declarations so each translation unit understands that
// there will be a definition provided.
extern template __global__ void
-nbnxn_kernel_prune_cuda<true>(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
+nbnxn_kernel_prune_cuda<true>(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
extern template __global__ void
-nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
+nbnxn_kernel_prune_cuda<false>(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
#else
{
const nbnxn_sci_t* pl_sci = plist.sci;
nbnxn_cj4_t* pl_cj4 = plist.cj4;
const float4* xq = atdat.xq;
- const float3* shift_vec = asFloat3(atdat.shift_vec);
+ const float3* shift_vec = asFloat3(atdat.shiftVec);
float rlistOuter_sq = nbparam.rlistOuter_sq;
float rlistInner_sq = nbparam.rlistInner_sq;
/*! \brief cluster size = number of atoms per cluster. */
static constexpr int c_clSize = c_nbnxnGpuClusterSize;
-/* All structs prefixed with "cu_" hold data used in GPU calculations and
- * are passed to the kernels, except cu_timers_t. */
-/*! \cond */
-typedef struct cu_atomdata cu_atomdata_t;
-/*! \endcond */
-
-
-/** \internal
- * \brief Staging area for temporary data downloaded from the GPU.
- *
- * The energies/shift forces get downloaded here first, before getting added
- * to the CPU-side aggregate values.
- */
-struct nb_staging_t
-{
- //! LJ energy
- float* e_lj = nullptr;
- //! electrostatic energy
- float* e_el = nullptr;
- //! shift forces
- Float3* fshift = nullptr;
-};
-
-/** \internal
- * \brief Nonbonded atom data - both inputs and outputs.
- */
-struct cu_atomdata
-{
- //! number of atoms
- int natoms;
- //! number of local atoms
- int natoms_local;
- //! allocation size for the atom data (xq, f)
- int nalloc;
-
- //! atom coordinates + charges, size natoms
- DeviceBuffer<Float4> xq;
- //! force output array, size natoms
- DeviceBuffer<Float3> f;
-
- //! LJ energy output, size 1
- DeviceBuffer<float> e_lj;
- //! Electrostatics energy input, size 1
- DeviceBuffer<float> e_el;
-
- //! shift forces
- DeviceBuffer<Float3> fshift;
-
- //! number of atom types
- int ntypes;
- //! atom type indices, size natoms
- DeviceBuffer<int> atom_types;
- //! sqrt(c6),sqrt(c12) size natoms
- DeviceBuffer<Float2> lj_comb;
-
- //! shifts
- DeviceBuffer<Float3> shift_vec;
- //! true if the shift vector has been uploaded
- bool bShiftVecUploaded;
-};
-
/** \internal
* \brief Typedef of actual timer type.
*/
bool bNonLocalStreamDoneMarked = false;
/*! \brief atom data */
- cu_atomdata_t* atdat = nullptr;
+ NBAtomData* atdat = nullptr;
/*! \brief array of atom indices */
int* atomIndices = nullptr;
/*! \brief size of atom indices */
/*! \brief pair-list data structures (local and non-local) */
gmx::EnumerationArray<Nbnxm::InteractionLocality, Nbnxm::gpu_plist*> plist = { { nullptr } };
/*! \brief staging area where fshift/energies get downloaded */
- nb_staging_t nbst;
+ NBStagingData nbst;
/*! \brief local and non-local GPU streams */
gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
if (atomLocality == AtomLocality::Local)
{
*atomRangeBegin = 0;
- *atomRangeLen = atomData->natoms_local;
+ *atomRangeLen = atomData->numAtomsLocal;
}
else
{
- *atomRangeBegin = atomData->natoms_local;
- *atomRangeLen = atomData->natoms - atomData->natoms_local;
+ *atomRangeBegin = atomData->numAtomsLocal;
+ *atomRangeLen = atomData->numAtoms - atomData->numAtomsLocal;
}
}
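/* Hypothetical caller sketch (helper name assumed from its parameters; it is
 * not shown in this hunk):
 *   int atomRangeBegin, atomRangeLen;
 *   getGpuAtomRange(nb->atdat, atomLocality, &atomRangeBegin, &atomRangeLen);
 */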
* Note that this function should always be called after the transfers into the
* staging buffers has completed.
*
- * \tparam StagingData Type of staging data
* \param[in] nbst Nonbonded staging data
* \param[in] iLocality Interaction locality specifier
* \param[in] reduceEnergies True if energy reduction should be done
* \param[out] e_el Variable to accumulate electrostatic energy into
* \param[out] fshift Pointer to the array of shift forces to accumulate into
*/
-static inline void gpu_reduce_staged_outputs(const nb_staging_t& nbst,
+static inline void gpu_reduce_staged_outputs(const NBStagingData& nbst,
const InteractionLocality iLocality,
const bool reduceEnergies,
const bool reduceFshift,
{
if (reduceEnergies)
{
- *e_lj += *nbst.e_lj;
- *e_el += *nbst.e_el;
+ *e_lj += *nbst.eLJ;
+ *e_el += *nbst.eElec;
}
if (reduceFshift)
{
for (int i = 0; i < SHIFTS; i++)
{
- rvec_inc(fshift[i], nbst.fshift[i]);
+ rvec_inc(fshift[i], nbst.fShift[i]);
}
}
}
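/* Hypothetical call (argument order assumed from the doc comment above),
 * valid only after the DtoH copies into nbst have been synchronized on:
 *   gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy,
 *                             stepWork.computeVirial, e_lj, e_el, fshift);
 */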
//! Default for the prune kernel's j4 processing concurrency.
static constexpr int c_pruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY;
+/*! \internal
+ * \brief Staging area for temporary data downloaded from the GPU.
+ *
+ * Since SYCL buffers already have host-side storage, this is a bit redundant.
+ * But it allows prefetching of the data from the GPU, and brings the GPU backends closer together.
+ */
+struct NBStagingData
+{
+ //! LJ energy
+ float* eLJ = nullptr;
+ //! electrostatic energy
+ float* eElec = nullptr;
+ //! shift forces
+ Float3* fShift = nullptr;
+};
+
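/* These pointers are backed by pinned host memory (allocated with pmalloc in
 * the init code above), which is what lets the GpuApiCallBehavior::Async
 * DtoH copies into them proceed truly asynchronously. */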
+/*! \internal
+ * \brief Nonbonded atom data - both inputs and outputs.
+ */
+struct NBAtomData
+{
+ //! number of atoms
+ int numAtoms;
+ //! number of local atoms
+ int numAtomsLocal;
+ //! allocation size for the atom data (xq, f)
+ int numAtomsAlloc;
+
+ //! atom coordinates + charges, size \ref numAtoms
+ DeviceBuffer<Float4> xq;
+ //! force output array, size \ref numAtoms
+ DeviceBuffer<Float3> f;
+
+ //! LJ energy output, size 1
+ DeviceBuffer<float> eLJ;
+ //! Electrostatics energy output, size 1
+ DeviceBuffer<float> eElec;
+
+ //! shift forces
+ DeviceBuffer<Float3> fShift;
+
+ //! number of atom types
+ int numTypes;
+ //! atom type indices, size \ref numAtoms
+ DeviceBuffer<int> atomTypes;
+ //! sqrt(c6), sqrt(c12), size \ref numAtoms
+ DeviceBuffer<Float2> ljComb;
+
+ //! shift vectors
+ DeviceBuffer<Float3> shiftVec;
+ //! true if the shift vector has been uploaded
+ bool shiftVecUploaded;
+};
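/* Invariants maintained by gpu_init_atomdata() (a summary, not asserted in
 * code): once initialized, numAtomsLocal <= numAtoms <= numAtomsAlloc, and
 * exactly one of atomTypes / ljComb is allocated, depending on
 * useLjCombRule(). */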
+
/** \internal
* \brief Parameters required for the GPU nonbonded calculations.
*/
/* local/nonlocal offset and length used for xq and f */
int adat_begin, adat_len;
- cl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
gpu_plist* plist = nb->plist[iloc];
cl_timers_t* t = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
if (atomLocality == AtomLocality::Local)
{
adat_begin = 0;
- adat_len = adat->natoms_local;
+ adat_len = adat->numAtomsLocal;
}
else
{
- adat_begin = adat->natoms_local;
- adat_len = adat->natoms - adat->natoms_local;
+ adat_begin = adat->numAtomsLocal;
+ adat_len = adat->numAtoms - adat->numAtomsLocal;
}
/* beginning of timed HtoD section */
*/
void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
{
- cl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
cl_timers_t* t = nb->timers;
&nbparams_params,
&adat->xq,
&adat->f,
- &adat->e_lj,
- &adat->e_el,
- &adat->fshift,
- &adat->lj_comb,
- &adat->shift_vec,
+ &adat->eLJ,
+ &adat->eElec,
+ &adat->fShift,
+ &adat->ljComb,
+ &adat->shiftVec,
&nbp->nbfp,
&nbp->nbfp_comb,
&nbp->coulomb_tab,
{
const auto kernelArgs = prepareGpuKernelArguments(kernel,
config,
- &adat->ntypes,
+ &adat->numTypes,
&nbparams_params,
&adat->xq,
&adat->f,
- &adat->e_lj,
- &adat->e_el,
- &adat->fshift,
- &adat->atom_types,
- &adat->shift_vec,
+ &adat->eLJ,
+ &adat->eElec,
+ &adat->fShift,
+ &adat->atomTypes,
+ &adat->shiftVec,
&nbp->nbfp,
&nbp->nbfp_comb,
&nbp->coulomb_tab,
*/
void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
- cl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
cl_timers_t* t = nb->timers;
config,
&nbparams_params,
&adat->xq,
- &adat->shift_vec,
+ &adat->shiftVec,
&plist->sci,
&plist->cj4,
&plist->imask,
"Non-local stream is indicating that the copy back event is enqueued at the "
"beginning of the copy back function.");
- cl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
cl_timers_t* t = nb->timers;
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
if (stepWork.computeVirial)
{
static_assert(
- sizeof(*nb->nbst.fshift) == sizeof(Float3),
+ sizeof(*nb->nbst.fShift) == sizeof(Float3),
"Sizes of host- and device-side shift vector elements should be the same.");
- copyFromDeviceBuffer(nb->nbst.fshift,
- &adat->fshift,
+ copyFromDeviceBuffer(nb->nbst.fShift,
+ &adat->fShift,
0,
SHIFTS,
deviceStream,
/* DtoH energies */
if (stepWork.computeEnergy)
{
- static_assert(sizeof(*nb->nbst.e_lj) == sizeof(float),
+ static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
"Sizes of host- and device-side LJ energy terms should be the same.");
- copyFromDeviceBuffer(nb->nbst.e_lj,
- &adat->e_lj,
+ copyFromDeviceBuffer(nb->nbst.eLJ,
+ &adat->eLJ,
0,
1,
deviceStream,
GpuApiCallBehavior::Async,
bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
- static_assert(sizeof(*nb->nbst.e_el) == sizeof(float),
+ static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
"Sizes of host- and device-side electrostatic energy terms should be the "
"same.");
- copyFromDeviceBuffer(nb->nbst.e_el,
- &adat->e_el,
+ copyFromDeviceBuffer(nb->nbst.eElec,
+ &adat->eElec,
0,
1,
deviceStream,
/*! \brief Initializes the atomdata structure the first time; it only gets
 * filled at pair-search.
 */
-static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, const DeviceContext& deviceContext)
+static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext)
{
- ad->ntypes = ntypes;
+ ad->numTypes = ntypes;
- allocateDeviceBuffer(&ad->shift_vec, SHIFTS * DIM, deviceContext);
- ad->bShiftVecUploaded = CL_FALSE;
+ allocateDeviceBuffer(&ad->shiftVec, SHIFTS * DIM, deviceContext);
+ ad->shiftVecUploaded = false;
- allocateDeviceBuffer(&ad->fshift, SHIFTS * DIM, deviceContext);
- allocateDeviceBuffer(&ad->e_lj, 1, deviceContext);
- allocateDeviceBuffer(&ad->e_el, 1, deviceContext);
+ allocateDeviceBuffer(&ad->fShift, SHIFTS * DIM, deviceContext);
+ allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
+ allocateDeviceBuffer(&ad->eElec, 1, deviceContext);
/* initialize to nullptr pointers to data that is not allocated here and will
need reallocation in nbnxn_gpu_init_atomdata */
ad->f = nullptr;
/* size -1 indicates that the respective array hasn't been initialized yet */
- ad->natoms = -1;
- ad->nalloc = -1;
+ ad->numAtoms = -1;
+ ad->numAtomsAlloc = -1;
}
{
cl_int cl_error;
- cl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local]->stream();
size_t local_work_size[3] = { 1, 1, 1 };
global_work_size[0] = ((shifts + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0];
arg_no = 0;
- cl_error = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift));
- cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj));
- cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el));
+ cl_error = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fShift));
+ cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->eLJ));
+ cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->eElec));
cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
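/* The clSetKernelArg calls must match the kernel's parameter list in order
 * and type; a plausible signature (assumed, reconstructed from the argument
 * types above):
 *   __kernel void zero_e_fshift(__global float* fShift, __global float* eLJ,
 *                               __global float* eElec, unsigned int shifts);
 */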
* Initializes members of the atomdata and nbparam structs and
* clears e/fshift output buffers.
*/
-static void nbnxn_ocl_init_const(cl_atomdata_t* atomData,
+static void nbnxn_ocl_init_const(NBAtomData* atomData,
NBParamGpu* nbParams,
const interaction_const_t* ic,
const PairlistParams& listParams,
nb->dev_rundata = new gmx_device_runtime_data_t();
/* init nbst */
- pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
- pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
- pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
init_plist(nb->plist[InteractionLocality::Local]);
return;
}
- cl_atomdata_t* atomData = nb->atdat;
+ NBAtomData* atomData = nb->atdat;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
clearDeviceBufferAsync(&atomData->f, 0, natoms_clear, localStream);
//! This function is documented in the header file
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
- nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
+ nbnxn_ocl_clear_f(nb, nb->atdat->numAtoms);
/* clear shift force array and energies if the outputs were
used in the current step */
if (computeVirial)
//! This function is documented in the header file
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
- cl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
/* only if we have a dynamic box */
- if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
+ if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
{
static_assert(sizeof(Float3) == sizeof(nbatom->shift_vec[0]),
"Sizes of host- and device-side shift vectors should be the same.");
- copyToDeviceBuffer(&adat->shift_vec,
+ copyToDeviceBuffer(&adat->shiftVec,
reinterpret_cast<const Float3*>(nbatom->shift_vec.data()),
0,
SHIFTS,
localStream,
GpuApiCallBehavior::Async,
nullptr);
- adat->bShiftVecUploaded = CL_TRUE;
+ adat->shiftVecUploaded = true;
}
}
bool realloced;
bool bDoTime = nb->bDoTime;
cl_timers_t* timers = nb->timers;
- cl_atomdata_t* d_atdat = nb->atdat;
+ NBAtomData* d_atdat = nb->atdat;
const DeviceContext& deviceContext = *nb->deviceContext_;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
/* need to reallocate if we have to copy more atoms than the amount of space
-   available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
+   available and only allocate if we haven't initialized yet, i.e. d_atdat->numAtomsAlloc == -1 */
- if (natoms > d_atdat->nalloc)
+ if (natoms > d_atdat->numAtomsAlloc)
{
nalloc = over_alloc_small(natoms);
/* free up first if the arrays have already been initialized */
- if (d_atdat->nalloc != -1)
+ if (d_atdat->numAtomsAlloc != -1)
{
freeDeviceBuffer(&d_atdat->f);
freeDeviceBuffer(&d_atdat->xq);
- freeDeviceBuffer(&d_atdat->lj_comb);
- freeDeviceBuffer(&d_atdat->atom_types);
+ freeDeviceBuffer(&d_atdat->ljComb);
+ freeDeviceBuffer(&d_atdat->atomTypes);
}
if (useLjCombRule(nb->nbparam->vdwType))
{
// Two Lennard-Jones parameters per atom
- allocateDeviceBuffer(&d_atdat->lj_comb, nalloc, deviceContext);
+ allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext);
}
else
{
- allocateDeviceBuffer(&d_atdat->atom_types, nalloc, deviceContext);
+ allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext);
}
- d_atdat->nalloc = nalloc;
- realloced = true;
+ d_atdat->numAtomsAlloc = nalloc;
+ realloced = true;
}
- d_atdat->natoms = natoms;
- d_atdat->natoms_local = nbat->natoms_local;
+ d_atdat->numAtoms = natoms;
+ d_atdat->numAtomsLocal = nbat->natoms_local;
/* need to clear GPU f output if realloc happened */
if (realloced)
{
static_assert(sizeof(float) == sizeof(*nbat->params().lj_comb.data()),
              "Size of the LJ parameters element should be equal to the size of float (pairs are copied as Float2).");
- copyToDeviceBuffer(&d_atdat->lj_comb,
+ copyToDeviceBuffer(&d_atdat->ljComb,
reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
0,
natoms,
{
static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
"Sizes of host- and device-side atom types should be the same.");
- copyToDeviceBuffer(&d_atdat->atom_types,
+ copyToDeviceBuffer(&d_atdat->atomTypes,
nbat->params().type.data(),
0,
natoms,
/* Free atdat */
freeDeviceBuffer(&(nb->atdat->xq));
freeDeviceBuffer(&(nb->atdat->f));
- freeDeviceBuffer(&(nb->atdat->e_lj));
- freeDeviceBuffer(&(nb->atdat->e_el));
- freeDeviceBuffer(&(nb->atdat->fshift));
- freeDeviceBuffer(&(nb->atdat->lj_comb));
- freeDeviceBuffer(&(nb->atdat->atom_types));
- freeDeviceBuffer(&(nb->atdat->shift_vec));
+ freeDeviceBuffer(&(nb->atdat->eLJ));
+ freeDeviceBuffer(&(nb->atdat->eElec));
+ freeDeviceBuffer(&(nb->atdat->fShift));
+ freeDeviceBuffer(&(nb->atdat->ljComb));
+ freeDeviceBuffer(&(nb->atdat->atomTypes));
+ freeDeviceBuffer(&(nb->atdat->shiftVec));
sfree(nb->atdat);
/* Free nbparam */
}
/* Free nbst */
- pfree(nb->nbst.e_lj);
- nb->nbst.e_lj = nullptr;
+ pfree(nb->nbst.eLJ);
+ nb->nbst.eLJ = nullptr;
- pfree(nb->nbst.e_el);
- nb->nbst.e_el = nullptr;
+ pfree(nb->nbst.eElec);
+ nb->nbst.eElec = nullptr;
- pfree(nb->nbst.fshift);
- nb->nbst.fshift = nullptr;
+ pfree(nb->nbst.fShift);
+ nb->nbst.fShift = nullptr;
freeGpuProgram(nb->dev_rundata->program);
delete nb->dev_rundata;
ePruneNR
};
-/*! \internal
- * \brief Staging area for temporary data downloaded from the GPU.
- *
- * The energies/shift forces get downloaded here first, before getting added
- * to the CPU-side aggregate values.
- */
-struct nb_staging_t
-{
- //! LJ energy
- float* e_lj = nullptr;
- //! electrostatic energy
- float* e_el = nullptr;
- //! float3 buffer with shift forces
- Float3* fshift = nullptr;
-};
-
-/*! \internal
- * \brief Nonbonded atom data - both inputs and outputs.
- */
-typedef struct cl_atomdata
-{
- //! number of atoms
- int natoms;
- //! number of local atoms
- int natoms_local;
- //! allocation size for the atom data (xq, f)
- int nalloc;
-
- //! float4 buffer with atom coordinates + charges, size natoms
- DeviceBuffer<Float4> xq;
-
- //! float3 buffer with force output array, size natoms
- DeviceBuffer<Float3> f;
-
- //! LJ energy output, size 1
- DeviceBuffer<float> e_lj;
- //! Electrostatics energy input, size 1
- DeviceBuffer<float> e_el;
-
- //! float3 buffer with shift forces
- DeviceBuffer<Float3> fshift;
-
- //! number of atom types
- int ntypes;
- //! int buffer with atom type indices, size natoms
- DeviceBuffer<int> atom_types;
- //! float2 buffer with sqrt(c6),sqrt(c12), size natoms
- DeviceBuffer<Float2> lj_comb;
-
- //! float3 buffer with shifts values
- DeviceBuffer<Float3> shift_vec;
-
- //! true if the shift vector has been uploaded
- bool bShiftVecUploaded;
-} cl_atomdata_t;
-
/*! \internal
* \brief Data structure shared between the OpenCL device code and OpenCL host code
*
bool bNonLocalStreamDoneMarked = false;
//! atom data
- cl_atomdata_t* atdat = nullptr;
+ NBAtomData* atdat = nullptr;
//! parameters required for the non-bonded calc.
NBParamGpu* nbparam = nullptr;
//! pair-list data structures (local and non-local)
gmx::EnumerationArray<Nbnxm::InteractionLocality, Nbnxm::gpu_plist*> plist = { nullptr };
//! staging area where fshift/energies get downloaded
- nb_staging_t nbst;
+ NBStagingData nbst;
//! local and non-local GPU queues
gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
"beginning of the copy back function.");
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
- sycl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
/* don't launch non-local copy-back if there was no non-local work to do */
if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
/* DtoH fshift when virial is needed */
if (stepWork.computeVirial)
{
- GMX_ASSERT(sizeof(*nb->nbst.fshift) == adat->fShift.elementSize(),
+ GMX_ASSERT(sizeof(*nb->nbst.fShift) == adat->fShift.elementSize(),
"Sizes of host- and device-side shift vector elements should be the same.");
copyFromDeviceBuffer(
- nb->nbst.fshift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
+ nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
}
/* DtoH energies */
if (stepWork.computeEnergy)
{
- GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
+ GMX_ASSERT(sizeof(*nb->nbst.eLJ) == sizeof(float),
"Sizes of host- and device-side LJ energy terms should be the same.");
copyFromDeviceBuffer(
- nb->nbst.e_lj, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
- GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
+ nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+ GMX_ASSERT(sizeof(*nb->nbst.eElec) == sizeof(float),
"Sizes of host- and device-side electrostatic energy terms should be the "
"same.");
copyFromDeviceBuffer(
- nb->nbst.e_el, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+ nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
}
}
}
const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
- sycl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
gpu_plist* plist = nb->plist[iloc];
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
//! This function is documented in the header file
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
- sycl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
// Clear forces
- clearDeviceBufferAsync(&adat->f, 0, nb->atdat->natoms, localStream);
+ clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
// Clear shift force array and energies if the outputs were used in the current step
if (computeVirial)
{
static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& deviceContext)
{
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
- sycl_atomdata_t* atomdata = nb->atdat;
+ NBAtomData* atomdata = nb->atdat;
atomdata->numTypes = numTypes;
allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
atomdata->shiftVecUploaded = false;
atomdata->f = nullptr;
/* size -1 indicates that the respective array hasn't been initialized yet */
- atomdata->natoms = -1;
- atomdata->numAlloc = -1;
+ atomdata->numAtoms = -1;
+ atomdata->numAtomsAlloc = -1;
}
/*! \brief Initialize the nonbonded parameter data structure. */
{
auto* nb = new NbnxmGpu();
nb->deviceContext_ = &deviceStreamManager.context();
- nb->atdat = new sycl_atomdata_t;
+ nb->atdat = new NBAtomData;
nb->nbparam = new NBParamGpu;
nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
if (bLocalAndNonlocal)
nb->timings = nullptr;
/* init nbst */
- pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
- pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
- pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
init_plist(nb->plist[InteractionLocality::Local]);
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
- sycl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
/* only if we have a dynamic box */
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
GMX_ASSERT(!nb->bDoTime, "Timing on SYCL not supported yet");
- sycl_atomdata_t* atdat = nb->atdat;
+ NBAtomData* atdat = nb->atdat;
const DeviceContext& deviceContext = *nb->deviceContext_;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
int numAtoms = nbat->numAtoms();
bool reallocated = false;
- if (numAtoms > atdat->numAlloc)
+ if (numAtoms > atdat->numAtomsAlloc)
{
int numAlloc = over_alloc_small(numAtoms);
/* free up first if the arrays have already been initialized */
- if (atdat->numAlloc != -1)
+ if (atdat->numAtomsAlloc != -1)
{
freeDeviceBuffer(&atdat->f);
freeDeviceBuffer(&atdat->xq);
allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
}
- atdat->numAlloc = numAlloc;
- reallocated = true;
+ atdat->numAtomsAlloc = numAlloc;
+ reallocated = true;
}
- atdat->natoms = numAtoms;
- atdat->natoms_local = nbat->natoms_local;
+ atdat->numAtoms = numAtoms;
+ atdat->numAtomsLocal = nbat->natoms_local;
/* need to clear GPU f output if realloc happened */
if (reallocated)
{
- clearDeviceBufferAsync(&atdat->f, 0, atdat->numAlloc, localStream);
+ clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
}
if (useLjCombRule(nb->nbparam->vdwType))
return;
}
- sycl_atomdata_t* atdat = nb->atdat;
- NBParamGpu* nbparam = nb->nbparam;
+ NBAtomData* atdat = nb->atdat;
+ NBParamGpu* nbparam = nb->nbparam;
if ((!nbparam->coulomb_tab)
&& (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
}
/* Free nbst */
- pfree(nb->nbst.e_lj);
- nb->nbst.e_lj = nullptr;
+ pfree(nb->nbst.eLJ);
+ nb->nbst.eLJ = nullptr;
- pfree(nb->nbst.e_el);
- nb->nbst.e_el = nullptr;
+ pfree(nb->nbst.eElec);
+ nb->nbst.eElec = nullptr;
- pfree(nb->nbst.fshift);
- nb->nbst.fshift = nullptr;
+ pfree(nb->nbst.fShift);
+ nb->nbst.fShift = nullptr;
delete atdat;
delete nbparam;
void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc)
{
- sycl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
const bool doPruneNBL = (plist->haveFreshList && !nb->didPrune[iloc]);
const int part,
const int numSciInPart)
{
- sycl_atomdata_t* adat = nb->atdat;
+ NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
const bool haveFreshList = plist->haveFreshList;
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/enumerationhelpers.h"
-/*! \internal
- * \brief Staging area for temporary data downloaded from the GPU.
- *
- * Since SYCL buffers already have host-side storage, this is a bit redundant.
- * But it allows prefetching of the data from GPU, and brings GPU backends closer together.
- */
-struct nb_staging_t
-{
- //! LJ energy
- float* e_lj = nullptr;
- //! electrostatic energy
- float* e_el = nullptr;
- //! shift forces
- Float3* fshift = nullptr;
-};
-
-/** \internal
- * \brief Nonbonded atom data - both inputs and outputs.
- */
-struct sycl_atomdata_t
-{
- //! number of atoms
- int natoms;
- //! number of local atoms
- int natoms_local; //
- //! allocation size for the atom data (xq, f)
- int numAlloc;
-
- //! atom coordinates + charges, size \ref natoms
- DeviceBuffer<Float4> xq;
- //! force output array, size \ref natoms
- DeviceBuffer<Float3> f;
-
- //! LJ energy output, size 1
- DeviceBuffer<float> eLJ;
- //! Electrostatics energy input, size 1
- DeviceBuffer<float> eElec;
-
- //! shift forces
- DeviceBuffer<Float3> fShift;
-
- //! number of atom types
- int numTypes;
- //! atom type indices, size \ref natoms
- DeviceBuffer<int> atomTypes;
- //! sqrt(c6),sqrt(c12) size \ref natoms
- DeviceBuffer<Float2> ljComb;
-
- //! shifts
- DeviceBuffer<Float3> shiftVec;
- //! true if the shift vector has been uploaded
- bool shiftVecUploaded;
-};
-
class GpuEventSynchronizer;
/*! \internal
/*! \brief true indicates that the nonlocal_done event was marked */
bool bNonLocalStreamDoneMarked = false;
/*! \brief atom data */
- sycl_atomdata_t* atdat = nullptr;
+ NBAtomData* atdat = nullptr;
NBParamGpu* nbparam = nullptr;
/*! \brief pair-list data structures (local and non-local) */
gmx::EnumerationArray<Nbnxm::InteractionLocality, Nbnxm::gpu_plist*> plist = { { nullptr } };
/*! \brief staging area where fshift/energies get downloaded. Will be removed in SYCL. */
- nb_staging_t nbst;
+ NBStagingData nbst;
/*! \brief local and non-local GPU streams */
gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;