/* We always re-initialize the tables whether they are used or not */
init_interaction_const_tables(nullptr, ic, set->rlistOuter, ir.tabext);
- Nbnxm::gpu_pme_loadbal_update_param(nbv, ic);
+ Nbnxm::gpu_pme_loadbal_update_param(nbv, *ic);
if (!pme_lb->bSepPMERanks)
{
// TODO Remove this comment when the above order issue is resolved
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_context.h"
-#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/pmalloc.h"
*/
static unsigned int gpu_min_ci_balanced_factor = 44;
-/*! Initializes the atomdata structure first time, it only gets filled at
- pair-search. */
-static void init_atomdata_first(NBAtomData* ad,
- int nTypes,
- const DeviceContext& deviceContext,
- const DeviceStream& localStream)
+void gpu_init_platform_specific(NbnxmGpu* /* nb */)
{
- ad->numTypes = nTypes;
- allocateDeviceBuffer(&ad->shiftVec, SHIFTS, deviceContext);
- ad->shiftVecUploaded = false;
-
- allocateDeviceBuffer(&ad->fShift, SHIFTS, deviceContext);
- allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
- allocateDeviceBuffer(&ad->eElec, 1, deviceContext);
-
- clearDeviceBufferAsync(&ad->fShift, 0, SHIFTS, localStream);
- clearDeviceBufferAsync(&ad->eElec, 0, 1, localStream);
- clearDeviceBufferAsync(&ad->eLJ, 0, 1, localStream);
-
- /* initialize to nullptr poiters to data that is not allocated here and will
- need reallocation in nbnxn_cuda_init_atomdata */
- ad->xq = nullptr;
- ad->f = nullptr;
-
- /* size -1 indicates that the respective array hasn't been initialized yet */
- ad->numAtoms = -1;
- ad->numAtomsAlloc = -1;
-}
-
-/*! Initializes the nonbonded parameter data structure. */
-static void init_nbparam(NBParamGpu* nbp,
- const interaction_const_t* ic,
- const PairlistParams& listParams,
- const nbnxn_atomdata_t::Params& nbatParams,
- const DeviceContext& deviceContext)
-{
- const int ntypes = nbatParams.numTypes;
-
- set_cutoff_parameters(nbp, ic, listParams);
-
- /* The kernel code supports LJ combination rules (geometric and LB) for
- * all kernel types, but we only generate useful combination rule kernels.
- * We currently only use LJ combination rule (geometric and LB) kernels
- * for plain cut-off LJ. On Maxwell the force only kernels speed up 15%
- * with PME and 20% with RF, the other kernels speed up about half as much.
- * For LJ force-switch the geometric rule would give 7% speed-up, but this
- * combination is rarely used. LJ force-switch with LB rule is more common,
- * but gives only 1% speed-up.
- */
- nbp->vdwType = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
- nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
-
- /* generate table for PME */
- nbp->coulomb_tab = nullptr;
- if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
- {
- GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
- init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, deviceContext);
- }
-
- /* set up LJ parameter lookup table */
- if (!useLjCombRule(nbp->vdwType))
- {
- static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
- "Mismatch in the size of host / device data types");
- initParamLookupTable(&nbp->nbfp,
- &nbp->nbfp_texobj,
- reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
- ntypes * ntypes,
- deviceContext);
- }
-
- /* set up LJ-PME parameter lookup table */
- if (ic->vdwtype == VanDerWaalsType::Pme)
- {
- static_assert(sizeof(decltype(nbp->nbfp_comb))
- == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
- "Mismatch in the size of host / device data types");
- initParamLookupTable(&nbp->nbfp_comb,
- &nbp->nbfp_comb_texobj,
- reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
- ntypes,
- deviceContext);
- }
-}
-
-NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
- const interaction_const_t* ic,
- const PairlistParams& listParams,
- const nbnxn_atomdata_t* nbat,
- bool bLocalAndNonlocal)
-{
- auto nb = new NbnxmGpu();
- nb->deviceContext_ = &deviceStreamManager.context();
- snew(nb->atdat, 1);
- snew(nb->nbparam, 1);
- snew(nb->plist[InteractionLocality::Local], 1);
- if (bLocalAndNonlocal)
- {
- snew(nb->plist[InteractionLocality::NonLocal], 1);
- }
-
- nb->bUseTwoStreams = bLocalAndNonlocal;
-
- nb->timers = new Nbnxm::GpuTimers();
- snew(nb->timings, 1);
-
- /* init nbst */
- pmalloc((void**)&nb->nbst.eLJ, sizeof(*nb->nbst.eLJ));
- pmalloc((void**)&nb->nbst.eElec, sizeof(*nb->nbst.eElec));
- pmalloc((void**)&nb->nbst.fShift, SHIFTS * sizeof(*nb->nbst.fShift));
-
- init_plist(nb->plist[InteractionLocality::Local]);
-
- /* local/non-local GPU streams */
- GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
- "Local non-bonded stream should be initialized to use GPU for non-bonded.");
- const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
- nb->deviceStreams[InteractionLocality::Local] = &localStream;
- if (nb->bUseTwoStreams)
- {
- init_plist(nb->plist[InteractionLocality::NonLocal]);
-
- /* Note that the device we're running on does not have to support
- * priorities, because we are querying the priority range which in this
- * case will be a single value.
- */
- GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
- "Non-local non-bonded stream should be initialized to use GPU for "
- "non-bonded with domain decomposition.");
- nb->deviceStreams[InteractionLocality::NonLocal] =
- &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
- ;
- }
-
- /* WARNING: CUDA timings are incorrect with multiple streams.
- * This is the main reason why they are disabled by default.
- */
- // TODO: Consider turning on by default when we can detect nr of streams.
- nb->bDoTime = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
-
- if (nb->bDoTime)
- {
- init_timings(nb->timings);
- }
-
/* set the kernel type for the current GPU */
/* pick L1 cache configuration */
cuda_set_cacheconfig();
-
- const nbnxn_atomdata_t::Params& nbatParams = nbat->params();
- const DeviceContext& deviceContext = *nb->deviceContext_;
- init_atomdata_first(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
- init_nbparam(nb->nbparam, ic, listParams, nbatParams, deviceContext);
-
- nb->atomIndicesSize = 0;
- nb->atomIndicesSize_alloc = 0;
- nb->ncxy_na = 0;
- nb->ncxy_na_alloc = 0;
- nb->ncxy_ind = 0;
- nb->ncxy_ind_alloc = 0;
-
- if (debug)
- {
- fprintf(debug, "Initialized CUDA data structures.\n");
- }
-
- return nb;
}
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
return;
}
+ delete nb->timers;
+ sfree(nb->timings);
+
NBAtomData* atdat = nb->atdat;
NBParamGpu* nbparam = nb->nbparam;
- if ((!nbparam->coulomb_tab)
- && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
+ if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
{
destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
}
- delete nb->timers;
-
if (!useLjCombRule(nb->nbparam->vdwType))
{
destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
freeDeviceBuffer(&atdat->f);
freeDeviceBuffer(&atdat->xq);
- freeDeviceBuffer(&atdat->atomTypes);
- freeDeviceBuffer(&atdat->ljComb);
+ if (useLjCombRule(nb->nbparam->vdwType))
+ {
+ freeDeviceBuffer(&atdat->ljComb);
+ }
+ else
+ {
+ freeDeviceBuffer(&atdat->atomTypes);
+ }
/* Free plist */
auto* plist = nb->plist[InteractionLocality::Local];
freeDeviceBuffer(&plist->cj4);
freeDeviceBuffer(&plist->imask);
freeDeviceBuffer(&plist->excl);
- sfree(plist);
+ delete plist;
if (nb->bUseTwoStreams)
{
auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
freeDeviceBuffer(&plist_nl->cj4);
freeDeviceBuffer(&plist_nl->imask);
freeDeviceBuffer(&plist_nl->excl);
- sfree(plist_nl);
+ delete plist_nl;
}
/* Free nbst */
pfree(nb->nbst.fShift);
nb->nbst.fShift = nullptr;
- sfree(atdat);
- sfree(nbparam);
- sfree(nb->timings);
+ delete atdat;
+ delete nbparam;
delete nb;
if (debug)
*/
GPU_FUNC_QUALIFIER
void gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused* nbv,
- const interaction_const_t gmx_unused* ic) GPU_FUNC_TERM;
+ const interaction_const_t gmx_unused& ic) GPU_FUNC_TERM;
/** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
GPU_FUNC_QUALIFIER
/** Return the enum value of electrostatics kernel type for given interaction parameters \p ic. */
GPU_FUNC_QUALIFIER
-enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t gmx_unused* ic,
+enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t gmx_unused& ic,
const DeviceInformation gmx_unused& deviceInfo)
GPU_FUNC_TERM_WITH_RETURN(ElecType::Count);
/** Return the enum value of VdW kernel type for given \p ic and \p combRule. */
GPU_FUNC_QUALIFIER
-enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t gmx_unused* ic,
+enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t gmx_unused& ic,
LJCombinationRule gmx_unused ljCombinationRule)
GPU_FUNC_TERM_WITH_RETURN(VdwType::Count);
#include "nbnxm_gpu_data_mgmt.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
+#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/simulation_workload.h"
}
}
-void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
+void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
{
- nbp->ewald_beta = ic->ewaldcoeff_q;
- nbp->sh_ewald = ic->sh_ewald;
- nbp->epsfac = ic->epsfac;
- nbp->two_k_rf = 2.0 * ic->reactionFieldCoefficient;
- nbp->c_rf = ic->reactionFieldShift;
- nbp->rvdw_sq = ic->rvdw * ic->rvdw;
- nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
+ nbp->ewald_beta = ic.ewaldcoeff_q;
+ nbp->sh_ewald = ic.sh_ewald;
+ nbp->epsfac = ic.epsfac;
+ nbp->two_k_rf = 2.0 * ic.reactionFieldCoefficient;
+ nbp->c_rf = ic.reactionFieldShift;
+ nbp->rvdw_sq = ic.rvdw * ic.rvdw;
+ nbp->rcoulomb_sq = ic.rcoulomb * ic.rcoulomb;
nbp->rlistOuter_sq = listParams.rlistOuter * listParams.rlistOuter;
nbp->rlistInner_sq = listParams.rlistInner * listParams.rlistInner;
nbp->useDynamicPruning = listParams.useDynamicPruning;
- nbp->sh_lj_ewald = ic->sh_lj_ewald;
- nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
+ nbp->sh_lj_ewald = ic.sh_lj_ewald;
+ nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;
- nbp->rvdw_switch = ic->rvdw_switch;
- nbp->dispersion_shift = ic->dispersion_shift;
- nbp->repulsion_shift = ic->repulsion_shift;
- nbp->vdw_switch = ic->vdw_switch;
+ nbp->rvdw_switch = ic.rvdw_switch;
+ nbp->dispersion_shift = ic.dispersion_shift;
+ nbp->repulsion_shift = ic.repulsion_shift;
+ nbp->vdw_switch = ic.vdw_switch;
}
-void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
+void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
if (!nbv || !nbv->useGpu())
{
set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
- nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());
+ nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());
- GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
- init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
+ GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+ init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}
void init_plist(gpu_plist* pl)
t->dynamicPruneTime.t = 0.0;
}
+/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search.
+ *
+ * Records the number of atom types, allocates the shift-vector, shift-force and
+ * energy device buffers, queues clears of the force/energy accumulators in
+ * \p localStream, and marks the per-atom arrays as not yet allocated.
+ *
+ * \param[in,out] atomdata       Atom data structure to set up.
+ * \param[in]     numTypes       Number of atom types, stored in \p atomdata.
+ * \param[in]     deviceContext  Device context passed to the buffer allocations.
+ * \param[in]     localStream    Stream in which the buffer clears are issued.
+ */
+static void initAtomdataFirst(NBAtomData* atomdata,
+ int numTypes,
+ const DeviceContext& deviceContext,
+ const DeviceStream& localStream)
+{
+ atomdata->numTypes = numTypes;
+ allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
+ /* Nothing has been copied into shiftVec yet */
+ atomdata->shiftVecUploaded = false;
+
+ /* Output accumulators: SHIFTS shift-force entries and one entry per energy term */
+ allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
+ allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
+ allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
+
+ /* Clear the accumulators; the clears are issued in localStream */
+ clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
+ clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
+ clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
+
+ /* Set to nullptr the pointers to data that is not allocated here and
+ will need to be (re)allocated later */
+ atomdata->xq = nullptr;
+ atomdata->f = nullptr;
+
+ /* size -1 indicates that the respective array hasn't been initialized yet */
+ atomdata->numAtoms = -1;
+ atomdata->numAtomsAlloc = -1;
+}
+
+/*! \brief Initialize the nonbonded parameter data structure.
+ *
+ * Copies the cut-off parameters, selects the VdW and electrostatics kernel
+ * types, and sets up the Ewald force table and the LJ parameter tables. For
+ * every table that is not needed a one-element buffer is allocated instead.
+ *
+ * \param[in,out] nbp            Nonbonded parameter structure to fill.
+ * \param[in]     ic             Interaction constants.
+ * \param[in]     listParams     Pair-list parameters, forwarded to set_cutoff_parameters().
+ * \param[in]     nbatParams     Host atom-data parameters providing the LJ tables.
+ * \param[in]     deviceContext  Device context used for allocations and device info.
+ *
+ * NOTE(review): the gpu_free hunks visible in this patch release coulomb_tab,
+ * nbfp and nbfp_comb only under the matching elecType/vdwType condition;
+ * confirm that the one-element placeholder buffers allocated here are freed too.
+ */
+static void initNbparam(NBParamGpu* nbp,
+ const interaction_const_t& ic,
+ const PairlistParams& listParams,
+ const nbnxn_atomdata_t::Params& nbatParams,
+ const DeviceContext& deviceContext)
+{
+ const int numTypes = nbatParams.numTypes;
+
+ set_cutoff_parameters(nbp, ic, listParams);
+
+ /* Kernel flavors are chosen here; the table setup below depends on them */
+ nbp->vdwType = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
+ nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
+
+ /* With LJ-PME, check that the atom data uses the combination rule requested in ic */
+ if (ic.vdwtype == VanDerWaalsType::Pme)
+ {
+ if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
+ {
+ GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
+ "Combination rule mismatch!");
+ }
+ else
+ {
+ GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
+ "Combination rule mismatch!");
+ }
+ }
+
+ /* generate table for PME */
+ if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
+ {
+ GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+ init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
+ }
+ else
+ {
+ // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
+ allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
+ }
+
+ /* set up LJ parameter lookup table */
+ if (!useLjCombRule(nbp->vdwType))
+ {
+ /* The host table is reinterpreted as Float2 below, so two host elements
+ must have exactly the size of one device element */
+ static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+ "Mismatch in the size of host / device data types");
+ initParamLookupTable(&nbp->nbfp,
+ &nbp->nbfp_texobj,
+ reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+ numTypes * numTypes,
+ deviceContext);
+ }
+ else
+ {
+ // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
+ allocateDeviceBuffer(&nbp->nbfp, 1, deviceContext);
+ }
+
+ /* set up LJ-PME parameter lookup table */
+ if (ic.vdwtype == VanDerWaalsType::Pme)
+ {
+ /* Same host/device size requirement as for nbfp above */
+ static_assert(sizeof(decltype(nbp->nbfp_comb))
+ == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+ "Mismatch in the size of host / device data types");
+ initParamLookupTable(&nbp->nbfp_comb,
+ &nbp->nbfp_comb_texobj,
+ reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+ numTypes,
+ deviceContext);
+ }
+ else
+ {
+ // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
+ allocateDeviceBuffer(&nbp->nbfp_comb, 1, deviceContext);
+ }
+}
+
+/*! \brief Create the NBNXM GPU data structure and initialize its backend-independent parts.
+ *
+ * Allocates the atom-data, parameter, pair-list and timing structures, allocates the
+ * host staging buffers with pmalloc(), binds the local (and optionally non-local)
+ * non-bonded streams, fills the parameter and atom-data structures, and finally
+ * calls gpu_init_platform_specific() for the backend-specific setup.
+ *
+ * \param[in] deviceStreamManager  Provides the device context and the non-bonded streams.
+ * \param[in] ic                   Interaction constants; must not be null.
+ * \param[in] listParams           Pair-list parameters, forwarded to the parameter setup.
+ * \param[in] nbat                 Host atom data whose parameters are used; must not be null.
+ * \param[in] bLocalAndNonlocal    Whether a separate non-local pair list and stream are set up.
+ * \returns  The newly allocated and initialized object.
+ */
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+                   const interaction_const_t*      ic,
+                   const PairlistParams&           listParams,
+                   const nbnxn_atomdata_t*         nbat,
+                   const bool                      bLocalAndNonlocal)
+{
+    /* Both pointers are dereferenced unconditionally below */
+    GMX_ASSERT(ic, "Need a valid interaction constants object");
+    GMX_ASSERT(nbat, "Need a valid atom data object");
+
+    auto* nb           = new NbnxmGpu();
+    nb->deviceContext_ = &deviceStreamManager.context();
+    /* Value-initialize, so that members which are not explicitly set below do not
+     * start out indeterminate (the snew() allocation this replaces zeroed them). */
+    nb->atdat                             = new NBAtomData();
+    nb->nbparam                           = new NBParamGpu();
+    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist();
+    if (bLocalAndNonlocal)
+    {
+        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist();
+    }
+
+    nb->bUseTwoStreams = bLocalAndNonlocal;
+
+    nb->timers = new Nbnxm::GpuTimers();
+    snew(nb->timings, 1);
+
+    /* WARNING: CUDA timings are incorrect with multiple streams.
+     * This is the main reason why they are disabled by default.
+     * Can be enabled by setting GMX_ENABLE_GPU_TIMING environment variable.
+     * TODO: Consider turning on by default when we can detect nr of streams.
+     *
+     * OpenCL timing is enabled by default and can be disabled by
+     * GMX_DISABLE_GPU_TIMING environment variable.
+     *
+     * Timing is disabled in SYCL.
+     */
+    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
+                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));
+    /* Check only after bDoTime has been decided: timing must never be on with SYCL */
+    GMX_ASSERT(!(GMX_GPU_SYCL && nb->bDoTime), "GPU timing is not supported in SYCL");
+
+    if (nb->bDoTime)
+    {
+        init_timings(nb->timings);
+    }
+
+    /* init nbst */
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
+
+    init_plist(nb->plist[InteractionLocality::Local]);
+
+    /* local/non-local GPU streams */
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+    nb->deviceStreams[InteractionLocality::Local] = &localStream;
+    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
+    // out-of-order. But for the time being, it will be less disruptive to keep them.
+    if (nb->bUseTwoStreams)
+    {
+        init_plist(nb->plist[InteractionLocality::NonLocal]);
+
+        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                           "Non-local non-bonded stream should be initialized to use GPU for "
+                           "non-bonded with domain decomposition.");
+        nb->deviceStreams[InteractionLocality::NonLocal] =
+                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
+    }
+
+    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
+    const DeviceContext&            deviceContext = *nb->deviceContext_;
+
+    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
+    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
+
+    /* Backend-specific setup runs last, on the otherwise fully initialized object */
+    gpu_init_platform_specific(nb);
+
+    if (debug)
+    {
+        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
+    }
+
+    return nb;
+}
+
//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
{
freeDeviceBuffer(&atdat->f);
freeDeviceBuffer(&atdat->xq);
- freeDeviceBuffer(&atdat->ljComb);
- freeDeviceBuffer(&atdat->atomTypes);
+ if (useLjCombRule(nb->nbparam->vdwType))
+ {
+ freeDeviceBuffer(&atdat->ljComb);
+ }
+ else
+ {
+ freeDeviceBuffer(&atdat->atomTypes);
+ }
}
|| (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}
-enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
+enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
const DeviceInformation& deviceInfo)
{
- if (ic->eeltype == CoulombInteractionType::Cut)
+ if (ic.eeltype == CoulombInteractionType::Cut)
{
return ElecType::Cut;
}
- else if (EEL_RF(ic->eeltype))
+ else if (EEL_RF(ic.eeltype))
{
return ElecType::RF;
}
- else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
+ else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
{
- return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
+ return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
}
else
{
GMX_THROW(gmx::InconsistentInputError(
gmx::formatString("The requested electrostatics type %s is not implemented in "
"the GPU accelerated kernels!",
- enumValueToString(ic->eeltype))));
+ enumValueToString(ic.eeltype))));
}
}
-enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
+enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
{
- if (ic->vdwtype == VanDerWaalsType::Cut)
+ if (ic.vdwtype == VanDerWaalsType::Cut)
{
- switch (ic->vdw_modifier)
+ switch (ic.vdw_modifier)
{
case InteractionModifiers::None:
case InteractionModifiers::PotShift:
GMX_THROW(gmx::InconsistentInputError(
gmx::formatString("The requested VdW interaction modifier %s is not "
"implemented in the GPU accelerated kernels!",
- enumValueToString(ic->vdw_modifier))));
+ enumValueToString(ic.vdw_modifier))));
}
}
- else if (ic->vdwtype == VanDerWaalsType::Pme)
+ else if (ic.vdwtype == VanDerWaalsType::Pme)
{
- if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
+ if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
{
assert(ljCombinationRule == LJCombinationRule::Geometric);
return VdwType::EwaldGeom;
{
GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
"The requested VdW type %s is not implemented in the GPU accelerated kernels!",
- enumValueToString(ic->vdwtype))));
+ enumValueToString(ic.vdwtype))));
}
}
#ifndef GMX_NBNXM_NBNXM_GPU_DATA_MGMT_H
#define GMX_NBNXM_NBNXM_GPU_DATA_MGMT_H
+class DeviceContext;
struct interaction_const_t;
struct NBParamGpu;
struct PairlistParams;
/*! \brief Copies all parameters related to the cut-off from ic to nbp
*/
-void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams);
+void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams);
/*! \brief Initializes the pair list data structure.
*/
/*! \brief Initializes the timings data structure. */
void init_timings(gmx_wallclock_gpu_nbnxn_t* t);
+void gpu_init_platform_specific(NbnxmGpu* nb);
+
} // namespace Nbnxm
#endif // GMX_NBNXM_NBNXM_GPU_DATA_MGMT_H
#include <cmath>
-#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/hardware/device_management.h"
*/
static unsigned int gpu_min_ci_balanced_factor = 50;
-
-/*! \brief Initializes the atomdata structure first time, it only gets filled at
- pair-search.
- */
-static void init_atomdata_first(NBAtomData* ad,
- int ntypes,
- const DeviceContext& deviceContext,
- const DeviceStream& localStream)
-{
- ad->numTypes = ntypes;
-
- allocateDeviceBuffer(&ad->shiftVec, SHIFTS, deviceContext);
- ad->shiftVecUploaded = false;
-
- allocateDeviceBuffer(&ad->fShift, SHIFTS, deviceContext);
- allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
- allocateDeviceBuffer(&ad->eElec, 1, deviceContext);
-
- clearDeviceBufferAsync(&ad->fShift, 0, SHIFTS, localStream);
- clearDeviceBufferAsync(&ad->eElec, 0, 1, localStream);
- clearDeviceBufferAsync(&ad->eLJ, 0, 1, localStream);
-
- /* initialize to nullptr pointers to data that is not allocated here and will
- need reallocation in nbnxn_gpu_init_atomdata */
- ad->xq = nullptr;
- ad->f = nullptr;
-
- /* size -1 indicates that the respective array hasn't been initialized yet */
- ad->numAtoms = -1;
- ad->numAtomsAlloc = -1;
-}
-
-
-/*! \brief Initializes the nonbonded parameter data structure.
- */
-static void init_nbparam(NBParamGpu* nbp,
- const interaction_const_t* ic,
- const PairlistParams& listParams,
- const nbnxn_atomdata_t::Params& nbatParams,
- const DeviceContext& deviceContext)
-{
- set_cutoff_parameters(nbp, ic, listParams);
-
- nbp->vdwType = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
- nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
-
- if (ic->vdwtype == VanDerWaalsType::Pme)
- {
- if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
- {
- GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
- "Combination rule mismatch!");
- }
- else
- {
- GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
- "Combination rule mismatch!");
- }
- }
- /* generate table for PME */
- nbp->coulomb_tab = nullptr;
- if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
- {
- GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
- init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, deviceContext);
- }
- else
- {
- allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
- }
-
- {
- /* set up LJ parameter lookup table */
- static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
- "Mismatch in the size of host / device data types");
- DeviceBuffer<Float2> nbfp;
- initParamLookupTable(&nbfp,
- nullptr,
- reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
- nbatParams.numTypes * nbatParams.numTypes,
- deviceContext);
- nbp->nbfp = nbfp;
-
- if (ic->vdwtype == VanDerWaalsType::Pme)
- {
- static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
- "Mismatch in the size of host / device data types");
- DeviceBuffer<Float2> nbfp_comb;
- initParamLookupTable(&nbfp_comb,
- nullptr,
- reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
- nbatParams.numTypes,
- deviceContext);
- nbp->nbfp_comb = nbfp_comb;
- }
- }
-}
-
/*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
static cl_kernel nbnxn_gpu_create_kernel(NbnxmGpu* nb, const char* kernel_name)
{
nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_rolling_opencl");
}
-//! This function is documented in the header file
-NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
- const interaction_const_t* ic,
- const PairlistParams& listParams,
- const nbnxn_atomdata_t* nbat,
- const bool bLocalAndNonlocal)
+void gpu_init_platform_specific(NbnxmGpu* nb)
{
- GMX_ASSERT(ic, "Need a valid interaction constants object");
-
- auto nb = new NbnxmGpu();
- nb->deviceContext_ = &deviceStreamManager.context();
- snew(nb->atdat, 1);
- snew(nb->nbparam, 1);
- snew(nb->plist[InteractionLocality::Local], 1);
- if (bLocalAndNonlocal)
- {
- snew(nb->plist[InteractionLocality::NonLocal], 1);
- }
-
- nb->bUseTwoStreams = bLocalAndNonlocal;
-
- nb->timers = new Nbnxm::GpuTimers();
- snew(nb->timings, 1);
-
/* set device info, just point it to the right GPU among the detected ones */
nb->dev_rundata = new gmx_device_runtime_data_t();
- /* init nbst */
- pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
- pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
- pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
-
- init_plist(nb->plist[InteractionLocality::Local]);
-
- /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
- nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
-
- /* local/non-local GPU streams */
- GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
- "Local non-bonded stream should be initialized to use GPU for non-bonded.");
- const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
- nb->deviceStreams[InteractionLocality::Local] = &localStream;
-
- if (nb->bUseTwoStreams)
- {
- init_plist(nb->plist[InteractionLocality::NonLocal]);
-
- GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
- "Non-local non-bonded stream should be initialized to use GPU for "
- "non-bonded with domain decomposition.");
- nb->deviceStreams[InteractionLocality::NonLocal] =
- &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
- }
-
- if (nb->bDoTime)
- {
- init_timings(nb->timings);
- }
-
- const nbnxn_atomdata_t::Params& nbatParams = nbat->params();
- const DeviceContext& deviceContext = *nb->deviceContext_;
- init_atomdata_first(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
- init_nbparam(nb->nbparam, ic, listParams, nbatParams, deviceContext);
-
/* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
* TODO: decide about NVIDIA
*/
*/
nbnxn_gpu_compile_kernels(nb);
nbnxn_gpu_init_kernels(nb);
-
- if (debug)
- {
- fprintf(debug, "Initialized OpenCL data structures.\n");
- }
-
- return nb;
}
//! This function is documented in the header file
return;
}
+ delete nb->timers;
+ sfree(nb->timings);
+
+ NBAtomData* atdat = nb->atdat;
+ NBParamGpu* nbparam = nb->nbparam;
+
/* Free kernels */
// NOLINTNEXTLINE(bugprone-sizeof-expression)
int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
freeDeviceBuffer(&(nb->atdat->eLJ));
freeDeviceBuffer(&(nb->atdat->eElec));
freeDeviceBuffer(&(nb->atdat->fShift));
- freeDeviceBuffer(&(nb->atdat->ljComb));
- freeDeviceBuffer(&(nb->atdat->atomTypes));
freeDeviceBuffer(&(nb->atdat->shiftVec));
- sfree(nb->atdat);
+ if (useLjCombRule(nb->nbparam->vdwType))
+ {
+ freeDeviceBuffer(&atdat->ljComb);
+ }
+ else
+ {
+ freeDeviceBuffer(&atdat->atomTypes);
+ }
/* Free nbparam */
- freeDeviceBuffer(&(nb->nbparam->nbfp));
- freeDeviceBuffer(&(nb->nbparam->nbfp_comb));
- freeDeviceBuffer(&(nb->nbparam->coulomb_tab));
- sfree(nb->nbparam);
+ if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
+ {
+ destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
+ }
+
+ if (!useLjCombRule(nb->nbparam->vdwType))
+ {
+ destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
+ }
+
+ if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
+ {
+ destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
+ }
/* Free plist */
auto* plist = nb->plist[InteractionLocality::Local];
freeDeviceBuffer(&plist->cj4);
freeDeviceBuffer(&plist->imask);
freeDeviceBuffer(&plist->excl);
- sfree(plist);
+ delete plist;
if (nb->bUseTwoStreams)
{
auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
freeDeviceBuffer(&plist_nl->cj4);
freeDeviceBuffer(&plist_nl->imask);
freeDeviceBuffer(&plist_nl->excl);
- sfree(plist_nl);
+ delete plist_nl;
}
/* Free nbst */
freeGpuProgram(nb->dev_rundata->program);
delete nb->dev_rundata;
- /* Free timers and timings */
- delete nb->timers;
- sfree(nb->timings);
+ delete atdat;
+ delete nbparam;
delete nb;
if (debug)
*/
#include "gmxpre.h"
-#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
namespace Nbnxm
{
-/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
-static void initAtomdataFirst(NBAtomData* atomdata,
- int numTypes,
- const DeviceContext& deviceContext,
- const DeviceStream& localStream)
+void gpu_init_platform_specific(NbnxmGpu* /* nb */)
{
- atomdata->numTypes = numTypes;
- allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
- atomdata->shiftVecUploaded = false;
-
- allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
- allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
- allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
-
- clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
- clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
- clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
-
- /* initialize to nullptr pointers to data that is not allocated here and will
- need reallocation in later */
- atomdata->xq = nullptr;
- atomdata->f = nullptr;
-
- /* size -1 indicates that the respective array hasn't been initialized yet */
- atomdata->numAtoms = -1;
- atomdata->numAtomsAlloc = -1;
-}
-
-/*! \brief Initialize the nonbonded parameter data structure. */
-static void initNbparam(NBParamGpu* nbp,
- const interaction_const_t& ic,
- const PairlistParams& listParams,
- const nbnxn_atomdata_t::Params& nbatParams,
- const DeviceContext& deviceContext)
-{
- const int numTypes = nbatParams.numTypes;
-
- set_cutoff_parameters(nbp, &ic, listParams);
-
- nbp->vdwType = nbnxmGpuPickVdwKernelType(&ic, nbatParams.ljCombinationRule);
- nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(&ic, deviceContext.deviceInfo());
-
- /* generate table for PME */
- nbp->coulomb_tab = nullptr;
- if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
- {
- GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
- init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
- }
-
- /* set up LJ parameter lookup table */
- if (!useLjCombRule(nbp->vdwType))
- {
- static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
- "Mismatch in the size of host / device data types");
- initParamLookupTable(&nbp->nbfp,
- &nbp->nbfp_texobj,
- reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
- numTypes * numTypes,
- deviceContext);
- }
-
- /* set up LJ-PME parameter lookup table */
- if (ic.vdwtype == VanDerWaalsType::Pme)
- {
- static_assert(sizeof(decltype(nbp->nbfp_comb))
- == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
- "Mismatch in the size of host / device data types");
- initParamLookupTable(&nbp->nbfp_comb,
- &nbp->nbfp_comb_texobj,
- reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
- numTypes,
- deviceContext);
- }
-}
-
-NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
- const interaction_const_t* ic,
- const PairlistParams& listParams,
- const nbnxn_atomdata_t* nbat,
- const bool bLocalAndNonlocal)
-{
- auto* nb = new NbnxmGpu();
- nb->deviceContext_ = &deviceStreamManager.context();
- nb->atdat = new NBAtomData;
- nb->nbparam = new NBParamGpu;
- nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
- if (bLocalAndNonlocal)
- {
- nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
- }
-
- nb->bUseTwoStreams = bLocalAndNonlocal;
-
- nb->timers = nullptr;
- nb->timings = nullptr;
-
- /* init nbst */
- pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
- pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
- pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
-
- init_plist(nb->plist[InteractionLocality::Local]);
-
- /* local/non-local GPU streams */
- GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
- "Local non-bonded stream should be initialized to use GPU for non-bonded.");
- const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
- nb->deviceStreams[InteractionLocality::Local] = &localStream;
- // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
- // out-of-order. But for the time being, it will be less disruptive to keep them.
- if (nb->bUseTwoStreams)
- {
- init_plist(nb->plist[InteractionLocality::NonLocal]);
-
- GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
- "Non-local non-bonded stream should be initialized to use GPU for "
- "non-bonded with domain decomposition.");
- nb->deviceStreams[InteractionLocality::NonLocal] =
- &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
- }
-
- nb->bDoTime = false;
-
- const nbnxn_atomdata_t::Params& nbatParams = nbat->params();
- const DeviceContext& deviceContext = *nb->deviceContext_;
-
- initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
- initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
-
- return nb;
+ // Intentionally empty: the SYCL backend needs no platform-specific initialization.
}
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
return;
}
+ delete nb->timers;
+ sfree(nb->timings);
+
NBAtomData* atdat = nb->atdat;
NBParamGpu* nbparam = nb->nbparam;
- if ((!nbparam->coulomb_tab)
- && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
+ if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
{
destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
}