From 28d7dbb688b4c6d83696e9e795795a83dbc670d2 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Thu, 17 Jan 2019 14:25:50 +0100 Subject: [PATCH] Clean up nbnxm enums Converted the nbnxm kernel and ewald excl enums to enum classes. Added resource, kernel setup and pairlist type enums. Also made pairlistSets_ private. Change-Id: Ic8ff8c59cf8a72d3462b8a0f7382cd1636136c40 --- src/gromacs/mdlib/calc_verletbuf.cpp | 19 +- src/gromacs/mdlib/calc_verletbuf.h | 10 +- src/gromacs/mdlib/sim_util.cpp | 64 ++--- src/gromacs/nbnxm/atomdata.cpp | 27 +-- src/gromacs/nbnxm/atomdata.h | 7 +- src/gromacs/nbnxm/grid.cpp | 4 +- src/gromacs/nbnxm/kerneldispatch.cpp | 57 +++-- src/gromacs/nbnxm/nbnxm.h | 221 ++++++++++++------ src/gromacs/nbnxm/nbnxm_geometry.cpp | 76 ------ src/gromacs/nbnxm/nbnxm_geometry.h | 50 +++- src/gromacs/nbnxm/nbnxm_setup.cpp | 203 +++++++++------- src/gromacs/nbnxm/pairlist.cpp | 59 ++--- src/gromacs/nbnxm/pairlist.h | 72 +++--- src/gromacs/nbnxm/pairlist_tuning.cpp | 9 +- src/gromacs/nbnxm/pairlist_tuning.h | 2 - src/gromacs/nbnxm/pairlistset.cpp | 56 ++++- src/gromacs/nbnxm/pairlistset.h | 8 +- src/gromacs/nbnxm/prunekerneldispatch.cpp | 19 +- src/gromacs/taskassignment/decidegpuusage.cpp | 1 - src/gromacs/taskassignment/decidegpuusage.h | 11 +- 20 files changed, 562 insertions(+), 413 deletions(-) diff --git a/src/gromacs/mdlib/calc_verletbuf.cpp b/src/gromacs/mdlib/calc_verletbuf.cpp index 7bee75dae4..b494e16652 100644 --- a/src/gromacs/mdlib/calc_verletbuf.cpp +++ b/src/gromacs/mdlib/calc_verletbuf.cpp @@ -107,18 +107,17 @@ struct pot_derivatives_t real md3; // -V''' at the cutoff }; -VerletbufListSetup verletbufGetListSetup(int nbnxnKernelType) +VerletbufListSetup verletbufGetListSetup(Nbnxm::KernelType nbnxnKernelType) { /* Note that the current buffer estimation code only handles clusters * of size 1, 2 or 4, so for 4x8 or 8x8 we use the estimate for 4x4. */ VerletbufListSetup listSetup; - listSetup.cluster_size_i = nbnxn_kernel_to_cluster_i_size(nbnxnKernelType); - listSetup.cluster_size_j = nbnxn_kernel_to_cluster_j_size(nbnxnKernelType); + listSetup.cluster_size_i = Nbnxm::IClusterSizePerKernelType[nbnxnKernelType]; + listSetup.cluster_size_j = Nbnxm::JClusterSizePerKernelType[nbnxnKernelType]; - if (nbnxnKernelType == nbnxnk8x8x8_GPU || - nbnxnKernelType == nbnxnk8x8x8_PlainC) + if (!Nbnxm::kernelTypeUsesSimplePairlist(nbnxnKernelType)) { /* The GPU kernels (except for OpenCL) split the j-clusters in two halves */ listSetup.cluster_size_j /= 2; @@ -134,24 +133,24 @@ VerletbufListSetup verletbufGetSafeListSetup(ListSetupType listType) * i- and j-cluster sizes, so we potentially overestimate, but never * underestimate, the buffer drift. */ - int nbnxnKernelType; + Nbnxm::KernelType nbnxnKernelType; if (listType == ListSetupType::Gpu) { - nbnxnKernelType = nbnxnk8x8x8_GPU; + nbnxnKernelType = Nbnxm::KernelType::Gpu8x8x8; } else if (GMX_SIMD && listType == ListSetupType::CpuSimdWhenSupported) { #ifdef GMX_NBNXN_SIMD_2XNN /* We use the smallest cluster size to be on the safe side */ - nbnxnKernelType = nbnxnk4xN_SIMD_2xNN; + nbnxnKernelType = Nbnxm::KernelType::Cpu4xN_Simd_2xNN; #else - nbnxnKernelType = nbnxnk4xN_SIMD_4xN; + nbnxnKernelType = Nbnxm::KernelType::Cpu4xN_Simd_4xN; #endif } else { - nbnxnKernelType = nbnxnk4x4_PlainC; + nbnxnKernelType = Nbnxm::KernelType::Cpu4x4_PlainC; } return verletbufGetListSetup(nbnxnKernelType); diff --git a/src/gromacs/mdlib/calc_verletbuf.h b/src/gromacs/mdlib/calc_verletbuf.h index a444b2e097..1a8cdc34ee 100644 --- a/src/gromacs/mdlib/calc_verletbuf.h +++ b/src/gromacs/mdlib/calc_verletbuf.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015,2017,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -48,6 +48,12 @@ namespace gmx class RangePartitioning; } // namespace gmx +namespace Nbnxm +{ +enum class KernelType; +} // namespace Nbnxm + + struct VerletbufListSetup { int cluster_size_i; /* Cluster pair-list i-cluster size atom count */ @@ -68,7 +74,7 @@ static const real verlet_buffer_ratio_NVE_T0 = 0.10; /* Returns the pair-list setup for the given nbnxn kernel type. */ -VerletbufListSetup verletbufGetListSetup(int nbnxnKernelType); +VerletbufListSetup verletbufGetListSetup(Nbnxm::KernelType nbnxnKernelType); /* Enum for choosing the list type for verletbufGetSafeListSetup() */ enum class ListSetupType diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index f68d7c841a..b89f57fd5b 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -423,7 +423,7 @@ static void do_nb_verlet(t_forcerec *fr, * the current coordinates of the atoms. */ wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING); - NbnxnDispatchPruneKernel(nbv, ilocality, fr->shift_vec); + nbv->dispatchPruneKernel(ilocality, fr->shift_vec); wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING); } @@ -438,23 +438,23 @@ static void do_nb_verlet(t_forcerec *fr, } } -static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists, - t_forcerec *fr, - rvec x[], - rvec f[], - const t_mdatoms *mdatoms, - t_lambda *fepvals, - real *lambda, - gmx_enerdata_t *enerd, - int flags, - t_nrnb *nrnb, - gmx_wallcycle_t wcycle) +static void do_nb_verlet_fep(const nonbonded_verlet_t &nbv, + const Nbnxm::InteractionLocality iLocality, + t_forcerec *fr, + rvec x[], + rvec f[], + const t_mdatoms *mdatoms, + t_lambda *fepvals, + real *lambda, + gmx_enerdata_t *enerd, + int flags, + t_nrnb *nrnb, + gmx_wallcycle_t wcycle) { int donb_flags; nb_kernel_data_t kernel_data; real lam_i[efptNR]; real dvdl_nb[efptNR]; - int th; int i, j; donb_flags = 0; @@ -488,15 +488,17 @@ static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists, dvdl_nb[i] = 0; } - GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl, "Number of lists should be same as number of NB threads"); + const gmx::ArrayRef nbl_fep = nbv.freeEnergyPairlistSet(iLocality); + + GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(), "Number of lists should be same as number of NB threads"); wallcycle_sub_start(wcycle, ewcsNONBONDED); -#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl) - for (th = 0; th < nbl_lists->nnbl; th++) +#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize()) + for (int th = 0; th < nbl_fep.ssize(); th++) { try { - gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th], + gmx_nb_free_energy_kernel(nbl_fep[th], x, f, fr, mdatoms, &kernel_data, nrnb); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; @@ -531,12 +533,12 @@ static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists, lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); } reset_foreign_enerdata(enerd); -#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl) - for (th = 0; th < nbl_lists->nnbl; th++) +#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize()) + for (int th = 0; th < nbl_fep.ssize(); th++) { try { - gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th], + gmx_nb_free_energy_kernel(nbl_fep[th], x, f, fr, mdatoms, &kernel_data, nrnb); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; @@ -1138,8 +1140,8 @@ static void do_force_cutsVERLET(FILE *fplog, wallcycle_start_nocount(wcycle, ewcNS); wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL); /* Note that with a GPU the launch overhead of the list transfer is not timed separately */ - nbnxn_make_pairlist(nbv, Nbnxm::InteractionLocality::Local, - &top->excls, step, nrnb); + nbv->constructPairlist(Nbnxm::InteractionLocality::Local, + &top->excls, step, nrnb); wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL); wallcycle_stop(wcycle, ewcNS); } @@ -1195,8 +1197,8 @@ static void do_force_cutsVERLET(FILE *fplog, wallcycle_start_nocount(wcycle, ewcNS); wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL); /* Note that with a GPU the launch overhead of the list transfer is not timed separately */ - nbnxn_make_pairlist(nbv, Nbnxm::InteractionLocality::NonLocal, - &top->excls, step, nrnb); + nbv->constructPairlist(Nbnxm::InteractionLocality::NonLocal, + &top->excls, step, nrnb); wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL); wallcycle_stop(wcycle, ewcNS); } @@ -1362,18 +1364,18 @@ static void do_force_cutsVERLET(FILE *fplog, /* Calculate the local and non-local free energy interactions here. * Happens here on the CPU both with and without GPU. */ - if (fr->nbv->pairlistSets[Nbnxm::InteractionLocality::Local].nbl_fep[0]->nrj > 0) + if (fr->nbv->freeEnergyPairlistSet(Nbnxm::InteractionLocality::Local)[0]->nrj > 0) { - do_nb_verlet_fep(&fr->nbv->pairlistSets[Nbnxm::InteractionLocality::Local], + do_nb_verlet_fep(*nbv, Nbnxm::InteractionLocality::Local, fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms, inputrec->fepvals, lambda, enerd, flags, nrnb, wcycle); } - if (DOMAINDECOMP(cr) && - fr->nbv->pairlistSets[Nbnxm::InteractionLocality::NonLocal].nbl_fep[0]->nrj > 0) + if (havePPDomainDecomposition(cr) && + fr->nbv->freeEnergyPairlistSet(Nbnxm::InteractionLocality::NonLocal)[0]->nrj > 0) { - do_nb_verlet_fep(&fr->nbv->pairlistSets[Nbnxm::InteractionLocality::NonLocal], + do_nb_verlet_fep(*nbv, Nbnxm::InteractionLocality::NonLocal, fr, as_rvec_array(x.unpaddedArrayRef().data()), f, mdatoms, inputrec->fepvals, lambda, enerd, flags, nrnb, wcycle); @@ -1403,7 +1405,7 @@ static void do_force_cutsVERLET(FILE *fplog, /* if there are multiple fshift output buffers reduce them */ if ((flags & GMX_FORCE_VIRIAL) && - nbv->pairlistSets[iloc].nnbl > 1) + nbv->pairlistSet(iloc).nnbl > 1) { /* This is not in a subcounter because it takes a negligible and constant-sized amount of time */ @@ -1458,7 +1460,7 @@ static void do_force_cutsVERLET(FILE *fplog, } /* skip the reduction if there was no non-local work to do */ - if (!nbv->pairlistSets[Nbnxm::InteractionLocality::NonLocal].nblGpu[0]->sci.empty()) + if (!nbv->pairlistSet(Nbnxm::InteractionLocality::NonLocal).nblGpu[0]->sci.empty()) { nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs.get(), Nbnxm::AtomLocality::NonLocal, nbv->nbat, f, wcycle); diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index 62ba0c4014..0fad12d1f8 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -91,7 +91,7 @@ void nbnxn_atomdata_t::resizeForceBuffers() } /* Initializes an nbnxn_atomdata_output_t data structure */ -nbnxn_atomdata_output_t::nbnxn_atomdata_output_t(int nb_kernel_type, +nbnxn_atomdata_output_t::nbnxn_atomdata_output_t(Nbnxm::KernelType kernelType, int numEnergyGroups, int simdEnergyBufferStride, gmx::PinningPolicy pinningPolicy) : @@ -104,10 +104,9 @@ nbnxn_atomdata_output_t::nbnxn_atomdata_output_t(int nb_kernel_ty Vvdw.resize(numEnergyGroups*numEnergyGroups); Vc.resize(numEnergyGroups*numEnergyGroups); - if (nb_kernel_type == nbnxnk4xN_SIMD_4xN || - nb_kernel_type == nbnxnk4xN_SIMD_2xNN) + if (Nbnxm::kernelTypeIsSimd(kernelType)) { - int cj_size = nbnxn_kernel_to_cluster_j_size(nb_kernel_type); + int cj_size = Nbnxm::JClusterSizePerKernelType[kernelType]; int numElements = numEnergyGroups*numEnergyGroups*simdEnergyBufferStride*(cj_size/2)*cj_size; VSvdw.resize(numElements); VSc.resize(numElements); @@ -438,7 +437,7 @@ nbnxn_atomdata_t::nbnxn_atomdata_t(gmx::PinningPolicy pinningPolicy) : /* Initializes an nbnxn_atomdata_t::Params data structure */ static void nbnxn_atomdata_params_init(const gmx::MDLogger &mdlog, nbnxn_atomdata_t::Params *params, - int nb_kernel_type, + const Nbnxm::KernelType kernelType, int enbnxninitcombrule, int ntype, const real *nbfp, int n_energygroups) @@ -538,7 +537,7 @@ static void nbnxn_atomdata_params_init(const gmx::MDLogger &mdlog, gmx::boolToString(bCombGeom), gmx::boolToString(bCombLB)); } - simple = nbnxn_kernel_pairlist_simple(nb_kernel_type); + simple = Nbnxm::kernelTypeUsesSimplePairlist(kernelType); switch (enbnxninitcombrule) { @@ -590,8 +589,7 @@ static void nbnxn_atomdata_params_init(const gmx::MDLogger &mdlog, gmx_incons("Unknown enbnxninitcombrule"); } - bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN || - nb_kernel_type == nbnxnk4xN_SIMD_2xNN); + bSIMD = Nbnxm::kernelTypeIsSimd(kernelType); set_lj_parameter_data(params, bSIMD); @@ -616,18 +614,17 @@ static void nbnxn_atomdata_params_init(const gmx::MDLogger &mdlog, /* Initializes an nbnxn_atomdata_t data structure */ void nbnxn_atomdata_init(const gmx::MDLogger &mdlog, nbnxn_atomdata_t *nbat, - int nb_kernel_type, + const Nbnxm::KernelType kernelType, int enbnxninitcombrule, int ntype, const real *nbfp, int n_energygroups, int nout) { - nbnxn_atomdata_params_init(mdlog, &nbat->paramsDeprecated(), nb_kernel_type, + nbnxn_atomdata_params_init(mdlog, &nbat->paramsDeprecated(), kernelType, enbnxninitcombrule, ntype, nbfp, n_energygroups); - const gmx_bool simple = nbnxn_kernel_pairlist_simple(nb_kernel_type); - const gmx_bool bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN || - nb_kernel_type == nbnxnk4xN_SIMD_2xNN); + const bool simple = Nbnxm::kernelTypeUsesSimplePairlist(kernelType); + const bool bSIMD = Nbnxm::kernelTypeIsSimd(kernelType); if (simple) { @@ -636,7 +633,7 @@ void nbnxn_atomdata_init(const gmx::MDLogger &mdlog, if (bSIMD) { pack_x = std::max(c_nbnxnCpuIClusterSize, - nbnxn_kernel_to_cluster_j_size(nb_kernel_type)); + Nbnxm::JClusterSizePerKernelType[kernelType]); switch (pack_x) { case 4: @@ -671,7 +668,7 @@ void nbnxn_atomdata_init(const gmx::MDLogger &mdlog, for (int i = 0; i < nout; i++) { const auto &pinningPolicy = nbat->params().type.get_allocator().pinningPolicy(); - nbat->out.emplace_back(nb_kernel_type, nbat->params().nenergrp, 1 << nbat->params().neg_2log, + nbat->out.emplace_back(kernelType, nbat->params().nenergrp, 1 << nbat->params().neg_2log, pinningPolicy); } diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index bd210ea014..fbd4dd3c00 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -54,6 +54,11 @@ struct nbnxn_search; struct t_mdatoms; struct gmx_wallcycle; +namespace Nbnxm +{ +enum class KernelType; +} + /* Reallocate the nbnxn_atomdata_t for a size of n atoms */ void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n); @@ -76,7 +81,7 @@ enum { */ void nbnxn_atomdata_init(const gmx::MDLogger &mdlog, nbnxn_atomdata_t *nbat, - int nb_kernel_type, + Nbnxm::KernelType kernelType, int enbnxninitcombrule, int ntype, const real *nbfp, int n_energygroups, diff --git a/src/gromacs/nbnxm/grid.cpp b/src/gromacs/nbnxm/grid.cpp index f4e5148248..7ccf6d18c5 100644 --- a/src/gromacs/nbnxm/grid.cpp +++ b/src/gromacs/nbnxm/grid.cpp @@ -1428,8 +1428,8 @@ void nbnxn_put_on_grid(nonbonded_verlet_t *nbv, grid->bSimple = nbv->pairlistIsSimple(); - grid->na_c = nbnxn_kernel_to_cluster_i_size(nbv->kernelType_); - grid->na_cj = nbnxn_kernel_to_cluster_j_size(nbv->kernelType_); + grid->na_c = IClusterSizePerListType[nbv->listParams->pairlistType]; + grid->na_cj = JClusterSizePerListType[nbv->listParams->pairlistType]; grid->na_sc = (grid->bSimple ? 1 : c_gpuNumClusterPerCell)*grid->na_c; grid->na_c_2log = get_2log(grid->na_c); diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp index a1ad25208a..5c302d7c88 100644 --- a/src/gromacs/nbnxm/kerneldispatch.cpp +++ b/src/gromacs/nbnxm/kerneldispatch.cpp @@ -128,8 +128,7 @@ reduceGroupEnergySimdBuffers(int numGroups, * within this function. * * \param[in] pairlistSet Pairlists with local or non-local interactions to compute - * \param[in] kernel_type The non-bonded kernel type - * \param[in] ewald_excl The Ewald exclusion treatment + * \param[in] kernelSetup The non-bonded kernel setup * \param[in,out] nbat The atomdata for the interactions * \param[in] ic Non-bonded interaction constants * \param[in] shiftVectors The PBC shift vectors @@ -141,8 +140,7 @@ reduceGroupEnergySimdBuffers(int numGroups, */ static void nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, - const int kernel_type, - const int ewald_excl, + const Nbnxm::KernelSetup &kernelSetup, nbnxn_atomdata_t *nbat, const interaction_const_t &ic, rvec *shiftVectors, @@ -160,7 +158,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, } else { - if (ewald_excl == ewaldexclTable) + if (kernelSetup.ewaldExclusionType == Nbnxm::EwaldExclusionType::Table) { if (ic.rcoulomb == ic.rvdw) { @@ -222,7 +220,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, { vdwkt = vdwktLJEWALDCOMBLB; /* At setup we (should have) selected the C reference kernel */ - GMX_RELEASE_ASSERT(kernel_type == nbnxnk4x4_PlainC, "Only the C reference nbnxn SIMD kernel supports LJ-PME with LB combination rules"); + GMX_RELEASE_ASSERT(kernelSetup.kernelType == Nbnxm::KernelType::Cpu4x4_PlainC, "Only the C reference nbnxn SIMD kernel supports LJ-PME with LB combination rules"); } } else @@ -264,9 +262,9 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, if (!(forceFlags & GMX_FORCE_ENERGY)) { /* Don't calculate energies */ - switch (kernel_type) + switch (kernelSetup.kernelType) { - case nbnxnk4x4_PlainC: + case Nbnxm::KernelType::Cpu4x4_PlainC: nbnxn_kernel_noener_ref[coulkt][vdwkt](nbl[nb], nbat, &ic, shiftVectors, @@ -274,7 +272,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, fshift_p); break; #ifdef GMX_NBNXN_SIMD_2XNN - case nbnxnk4xN_SIMD_2xNN: + case Nbnxm::KernelType::Cpu4xN_Simd_2xNN: nbnxm_kernel_noener_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat, &ic, shiftVectors, @@ -283,7 +281,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, break; #endif #ifdef GMX_NBNXN_SIMD_4XN - case nbnxnk4xN_SIMD_4xN: + case Nbnxm::KernelType::Cpu4xN_Simd_4xN: nbnxm_kernel_noener_simd_4xm[coulkt][vdwkt](nbl[nb], nbat, &ic, shiftVectors, @@ -301,9 +299,9 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, out->Vvdw[0] = 0; out->Vc[0] = 0; - switch (kernel_type) + switch (kernelSetup.kernelType) { - case nbnxnk4x4_PlainC: + case Nbnxm::KernelType::Cpu4x4_PlainC: nbnxn_kernel_ener_ref[coulkt][vdwkt](nbl[nb], nbat, &ic, shiftVectors, @@ -313,7 +311,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, out->Vc.data()); break; #ifdef GMX_NBNXN_SIMD_2XNN - case nbnxnk4xN_SIMD_2xNN: + case Nbnxm::KernelType::Cpu4xN_Simd_2xNN: nbnxm_kernel_ener_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat, &ic, shiftVectors, @@ -324,7 +322,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, break; #endif #ifdef GMX_NBNXN_SIMD_4XN - case nbnxnk4xN_SIMD_4xN: + case Nbnxm::KernelType::Cpu4xN_Simd_4xN: nbnxm_kernel_ener_simd_4xm[coulkt][vdwkt](nbl[nb], nbat, &ic, shiftVectors, @@ -345,9 +343,9 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, int unrollj = 0; - switch (kernel_type) + switch (kernelSetup.kernelType) { - case nbnxnk4x4_PlainC: + case Nbnxm::KernelType::Cpu4x4_PlainC: unrollj = c_nbnxnCpuIClusterSize; nbnxn_kernel_energrp_ref[coulkt][vdwkt](nbl[nb], nbat, &ic, @@ -358,7 +356,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, out->Vc.data()); break; #ifdef GMX_NBNXN_SIMD_2XNN - case nbnxnk4xN_SIMD_2xNN: + case Nbnxm::KernelType::Cpu4xN_Simd_2xNN: unrollj = GMX_SIMD_REAL_WIDTH/2; nbnxm_kernel_energrp_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat, &ic, @@ -370,7 +368,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, break; #endif #ifdef GMX_NBNXN_SIMD_4XN - case nbnxnk4xN_SIMD_4xN: + case Nbnxm::KernelType::Cpu4xN_Simd_4xN: unrollj = GMX_SIMD_REAL_WIDTH; nbnxm_kernel_energrp_simd_4xm[coulkt][vdwkt](nbl[nb], nbat, &ic, @@ -385,7 +383,7 @@ nbnxn_kernel_cpu(const nbnxn_pairlist_set_t &pairlistSet, GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture"); } - if (kernel_type != nbnxnk4x4_PlainC) + if (kernelSetup.kernelType != Nbnxm::KernelType::Cpu4x4_PlainC) { switch (unrollj) { @@ -423,7 +421,7 @@ static void accountFlops(t_nrnb *nrnb, const interaction_const_t &ic, const int forceFlags) { - const nbnxn_pairlist_set_t &pairlistSet = nbv.pairlistSets[iLocality]; + const nbnxn_pairlist_set_t &pairlistSet = nbv.pairlistSet(iLocality); const bool usingGpuKernels = nbv.useGpu(); int enr_nbnxn_kernel_ljc; @@ -431,7 +429,7 @@ static void accountFlops(t_nrnb *nrnb, { enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF; } - else if ((!usingGpuKernels && nbv.ewaldExclusionType_ == ewaldexclAnalytical) || + else if ((!usingGpuKernels && nbv.kernelSetup().ewaldExclusionType == Nbnxm::EwaldExclusionType::Analytical) || (usingGpuKernels && Nbnxm::gpu_is_kernel_ewald_analytical(nbv.gpu_nbv))) { enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD; @@ -486,16 +484,15 @@ void NbnxnDispatchKernel(nonbonded_verlet_t *nbv, gmx_enerdata_t *enerd, t_nrnb *nrnb) { - const nbnxn_pairlist_set_t &pairlistSet = nbv->pairlistSets[iLocality]; + const nbnxn_pairlist_set_t &pairlistSet = nbv->pairlistSet(iLocality); - switch (nbv->kernelType_) + switch (nbv->kernelSetup().kernelType) { - case nbnxnk4x4_PlainC: - case nbnxnk4xN_SIMD_4xN: - case nbnxnk4xN_SIMD_2xNN: + case Nbnxm::KernelType::Cpu4x4_PlainC: + case Nbnxm::KernelType::Cpu4xN_Simd_4xN: + case Nbnxm::KernelType::Cpu4xN_Simd_2xNN: nbnxn_kernel_cpu(pairlistSet, - nbv->kernelType_, - nbv->ewaldExclusionType_, + nbv->kernelSetup(), nbv->nbat, ic, fr->shift_vec, @@ -508,11 +505,11 @@ void NbnxnDispatchKernel(nonbonded_verlet_t *nbv, enerd->grpp.ener[egLJSR]); break; - case nbnxnk8x8x8_GPU: + case Nbnxm::KernelType::Gpu8x8x8: Nbnxm::gpu_launch_kernel(nbv->gpu_nbv, forceFlags, iLocality); break; - case nbnxnk8x8x8_PlainC: + case Nbnxm::KernelType::Cpu8x8x8_PlainC: nbnxn_kernel_gpu_ref(pairlistSet.nblGpu[0], nbv->nbat, &ic, fr->shift_vec, diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 908c6bdc68..640d782b93 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -118,6 +118,7 @@ struct gmx_hw_info_t; struct gmx_mtop_t; struct interaction_const_t; struct nbnxn_pairlist_set_t; +struct nonbonded_verlet_t; struct t_blocka; struct t_commrec; struct t_nrnb; @@ -130,87 +131,173 @@ class MDLogger; class UpdateGroupsCog; } -//! Help pass GPU-emulation parameters with type safety. -enum class EmulateGpuNonbonded : bool +/*! \brief Resources that can be used to execute non-bonded kernels on */ +enum class NonbondedResource : int { - //! Do not emulate GPUs. - No, - //! Do emulate GPUs. - Yes + Cpu, + Gpu, + EmulateGpu }; +namespace Nbnxm +{ /*! \brief Nonbonded NxN kernel types: plain C, CPU SIMD, GPU, GPU emulation */ -typedef enum +enum class KernelType : int { - nbnxnkNotSet = 0, - nbnxnk4x4_PlainC, - nbnxnk4xN_SIMD_4xN, - nbnxnk4xN_SIMD_2xNN, - nbnxnk8x8x8_GPU, - nbnxnk8x8x8_PlainC, - nbnxnkNR -} nbnxn_kernel_type; + NotSet = 0, + Cpu4x4_PlainC, + Cpu4xN_Simd_4xN, + Cpu4xN_Simd_2xNN, + Gpu8x8x8, + Cpu8x8x8_PlainC, + Count +}; -namespace Nbnxm +/*! \brief Ewald exclusion types */ +enum class EwaldExclusionType : int +{ + NotSet = 0, + Table, + Analytical, + DecidedByGpuModule +}; + +/* \brief The non-bonded setup, also affects the pairlist construction kernel */ +struct KernelSetup { + //! The non-bonded type, also affects the pairlist construction kernel + KernelType kernelType = KernelType::NotSet; + //! Ewald exclusion computation handling type, currently only used for CPU + EwaldExclusionType ewaldExclusionType = EwaldExclusionType::NotSet; +}; /*! \brief Return a string identifying the kernel type. * - * \param [in] kernel_type nonbonded kernel types, takes values from the nbnxn_kernel_type enum - * \returns a string identifying the kernel corresponding to the type passed as argument + * \param [in] kernelType nonbonded kernel type, takes values from the nbnxn_kernel_type enum + * \returns a string identifying the kernel corresponding to the type passed as argument */ -const char *lookup_kernel_name(int kernel_type); +const char *lookup_kernel_name(Nbnxm::KernelType kernelType); } // namespace Nbnxm -/*! \brief Ewald exclusion types */ -enum { - ewaldexclTable, ewaldexclAnalytical -}; - /*! \brief Flag to tell the nonbonded kernels whether to clear the force output buffers */ enum { enbvClearFNo, enbvClearFYes }; +/*! \brief Generates a pair-list for the given locality. + * + * With perturbed particles, also a group scheme style nbl_fep list is made. + */ +void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, + Nbnxm::InteractionLocality iLocality, + nbnxn_pairlist_set_t *pairlistSet, + const t_blocka *excl, + int64_t step, + t_nrnb *nrnb); + +/*! \brief Prune all pair-lists with given locality (currently CPU only) + * + * For all pair-lists with given locality, takes the outer list and prunes out + * pairs beyond the pairlist inner radius and writes the result to a list that is + * to be consumed by the non-bonded kernel. + */ +void NbnxnDispatchPruneKernel(nbnxn_pairlist_set_t *pairlistSet, + Nbnxm::KernelType kernelType, + const nbnxn_atomdata_t *nbat, + const rvec *shift_vec); + /*! \libinternal * \brief Top-level non-bonded data structure for the Verlet-type cut-off scheme. */ struct nonbonded_verlet_t { - //! Returns whether a GPU is used for the non-bonded calculations - bool useGpu() const - { - return kernelType_ == nbnxnk8x8x8_GPU; - } - - //! Returns whether a GPU is emulated for the non-bonded calculations - bool emulateGpu() const - { - return kernelType_ == nbnxnk8x8x8_PlainC; - } - - //! Return whether the pairlist is of simple, CPU type - bool pairlistIsSimple() const - { - return !useGpu() && !emulateGpu(); - } - - std::unique_ptr listParams; /**< Parameters for the search and list pruning setup */ - std::unique_ptr nbs; /**< n vs n atom pair searching data */ - int ngrp; /**< number of interaction groups */ - //! Local and non-local pairlist sets - gmx::EnumerationArray pairlistSets; - //! Atom data - nbnxn_atomdata_t *nbat; - - //! Non-bonded kernel - see enum above - int kernelType_; - //! Ewald exclusion - see enum above - int ewaldExclusionType_; - - gmx_nbnxn_gpu_t *gpu_nbv; /**< pointer to GPU nb verlet data */ - int min_ci_balanced; /**< pair list balancing parameter used for the 8x8x8 GPU kernels */ + public: + //! Returns whether a GPU is use for the non-bonded calculations + bool useGpu() const + { + return kernelSetup_.kernelType == Nbnxm::KernelType::Gpu8x8x8; + } + + //! Returns whether a GPU is emulated for the non-bonded calculations + bool emulateGpu() const + { + return kernelSetup_.kernelType == Nbnxm::KernelType::Cpu8x8x8_PlainC; + } + + //! Return whether the pairlist is of simple, CPU type + bool pairlistIsSimple() const + { + return !useGpu() && !emulateGpu(); + } + + //! Initialize the pair list sets, TODO this should be private + void initPairlistSets(bool haveMultipleDomains); + + //! Returns a reference to the pairlist set for the requested locality + const nbnxn_pairlist_set_t &pairlistSet(Nbnxm::InteractionLocality iLocality) const + { + GMX_ASSERT(static_cast(iLocality) < pairlistSets_.size(), + "The requested locality should be in the list"); + return pairlistSets_[static_cast(iLocality)]; + } + + //! Constructs the pairlist for the given locality + void constructPairlist(Nbnxm::InteractionLocality iLocality, + const t_blocka *excl, + int64_t step, + t_nrnb *nrnb) + { + nbnxn_make_pairlist(this, iLocality, &pairlistSets_[static_cast(iLocality)], excl, step, nrnb); + } + + //! Dispatches the dynamic pruning kernel for the given locality + void dispatchPruneKernel(Nbnxm::InteractionLocality iLocality, + const rvec *shift_vec) + { + GMX_ASSERT(static_cast(iLocality) < pairlistSets_.size(), + "The requested locality should be in the list"); + NbnxnDispatchPruneKernel(&pairlistSets_[static_cast(iLocality)], + kernelSetup_.kernelType, nbat, shift_vec); + } + + //! Return the kernel setup + const Nbnxm::KernelSetup &kernelSetup() const + { + return kernelSetup_; + } + + //! Sets the kernel setup, TODO: make private + void setKernelSetup(const Nbnxm::KernelSetup &kernelSetup) + { + kernelSetup_ = kernelSetup; + } + + //! Returns the a list of free-energy pairlists for the given locality + const gmx::ArrayRef + freeEnergyPairlistSet(Nbnxm::InteractionLocality iLocality) const + { + return pairlistSet(iLocality).nbl_fep; + } + + //! Parameters for the search and list pruning setup + std::unique_ptr listParams; + //! Working data for constructing the pairlists + std::unique_ptr nbs; + private: + //! Local and, optionally, non-local pairlist sets + std::vector pairlistSets_; + public: + //! Atom data + nbnxn_atomdata_t *nbat; + + private: + //! The non-bonded setup, also affects the pairlist construction kernel + Nbnxm::KernelSetup kernelSetup_; + public: + + gmx_nbnxn_gpu_t *gpu_nbv; /**< pointer to GPU nb verlet data */ + int min_ci_balanced; /**< pair list balancing parameter used for the 8x8x8 GPU kernels */ }; namespace Nbnxm @@ -276,16 +363,6 @@ void nbnxn_set_atomorder(nbnxn_search_t nbs); /*! \brief Returns the index position of the atoms on the pairlist search grid */ gmx::ArrayRef nbnxn_get_gridindices(const nbnxn_search* nbs); -/*! \brief Generates a pair-list for the given locality. - * - * With perturbed particles, also a group scheme style nbl_fep list is made. - */ -void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, - Nbnxm::InteractionLocality iLocality, - const t_blocka *excl, - int64_t step, - t_nrnb *nrnb); - /*! \brief Returns the number of steps performed with the current pair list */ int nbnxnNumStepsWithPairlist(const nonbonded_verlet_t &nbv, Nbnxm::InteractionLocality ilocality, @@ -296,16 +373,6 @@ bool nbnxnIsDynamicPairlistPruningStep(const nonbonded_verlet_t &nbv, Nbnxm::InteractionLocality ilocality, int64_t step); -/*! \brief Prune all pair-lists with given locality (currently CPU only) - * - * For all pair-lists with given locality, takes the outer list and prunes out - * pairs beyond the pairlist inner radius and writes the result to a list that is - * to be consumed by the non-bonded kernel. - */ -void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv, - Nbnxm::InteractionLocality iLocality, - const rvec *shift_vec); - /*! \brief Executes the non-bonded kernel of the GPU or launches it on the GPU */ void NbnxnDispatchKernel(nonbonded_verlet_t *nbv, Nbnxm::InteractionLocality iLocality, diff --git a/src/gromacs/nbnxm/nbnxm_geometry.cpp b/src/gromacs/nbnxm/nbnxm_geometry.cpp index 4c55452e9c..c164034c79 100644 --- a/src/gromacs/nbnxm/nbnxm_geometry.cpp +++ b/src/gromacs/nbnxm/nbnxm_geometry.cpp @@ -39,85 +39,9 @@ #include "gromacs/nbnxm/nbnxm.h" #include "gromacs/nbnxm/pairlist.h" -#include "gromacs/simd/simd.h" #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/real.h" -bool nbnxn_kernel_pairlist_simple(int nb_kernel_type) -{ - if (nb_kernel_type == nbnxnkNotSet) - { - gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list."); - } - - switch (nb_kernel_type) - { - case nbnxnk8x8x8_GPU: - case nbnxnk8x8x8_PlainC: - return false; - - case nbnxnk4x4_PlainC: - case nbnxnk4xN_SIMD_4xN: - case nbnxnk4xN_SIMD_2xNN: - return true; - - default: - gmx_incons("Invalid nonbonded kernel type passed!"); - return false; - } -} - -int nbnxn_kernel_to_cluster_i_size(int nb_kernel_type) -{ - switch (nb_kernel_type) - { - case nbnxnk4x4_PlainC: - case nbnxnk4xN_SIMD_4xN: - case nbnxnk4xN_SIMD_2xNN: - return c_nbnxnCpuIClusterSize; - case nbnxnk8x8x8_GPU: - case nbnxnk8x8x8_PlainC: - /* The cluster size for super/sub lists is only set here. - * Any value should work for the pair-search and atomdata code. - * The kernels, of course, might require a particular value. - */ - return c_nbnxnGpuClusterSize; - default: - gmx_incons("unknown kernel type"); - } -} - -int nbnxn_kernel_to_cluster_j_size(int nb_kernel_type) -{ - int nbnxn_simd_width = 0; - int cj_size = 0; - -#if GMX_SIMD - nbnxn_simd_width = GMX_SIMD_REAL_WIDTH; -#endif - - switch (nb_kernel_type) - { - case nbnxnk4x4_PlainC: - cj_size = c_nbnxnCpuIClusterSize; - break; - case nbnxnk4xN_SIMD_4xN: - cj_size = nbnxn_simd_width; - break; - case nbnxnk4xN_SIMD_2xNN: - cj_size = nbnxn_simd_width/2; - break; - case nbnxnk8x8x8_GPU: - case nbnxnk8x8x8_PlainC: - cj_size = nbnxn_kernel_to_cluster_i_size(nb_kernel_type); - break; - default: - gmx_incons("unknown kernel type"); - } - - return cj_size; -} - /* Clusters at the cut-off only increase rlist by 60% of their size */ static constexpr real c_nbnxnRlistIncreaseOutsideFactor = 0.6; diff --git a/src/gromacs/nbnxm/nbnxm_geometry.h b/src/gromacs/nbnxm/nbnxm_geometry.h index c683ee2b1f..e364ab7060 100644 --- a/src/gromacs/nbnxm/nbnxm_geometry.h +++ b/src/gromacs/nbnxm/nbnxm_geometry.h @@ -37,6 +37,9 @@ #define GMX_NBNXM_NBNXM_GEOMETRY_H #include "gromacs/math/vectypes.h" +#include "gromacs/nbnxm/nbnxm.h" +#include "gromacs/nbnxm/pairlist.h" +#include "gromacs/simd/simd.h" #include "gromacs/utility/fatalerror.h" /* Returns the base-2 log of n. @@ -59,14 +62,51 @@ static inline int get_2log(int n) return log2; } +namespace Nbnxm +{ + +/* The nbnxn i-cluster size in atoms for each nbnxn kernel type */ +static constexpr gmx::EnumerationArray IClusterSizePerKernelType = +{ + 0, + c_nbnxnCpuIClusterSize, + c_nbnxnCpuIClusterSize, + c_nbnxnCpuIClusterSize, + c_nbnxnGpuClusterSize, + c_nbnxnGpuClusterSize +}; + +/* The nbnxn j-cluster size in atoms for each nbnxn kernel type */ +static constexpr gmx::EnumerationArray JClusterSizePerKernelType = +{ + 0, + c_nbnxnCpuIClusterSize, +#if GMX_SIMD + GMX_SIMD_REAL_WIDTH, + GMX_SIMD_REAL_WIDTH/2, +#else + 0, + 0, +#endif + c_nbnxnGpuClusterSize, + c_nbnxnGpuClusterSize +}; + /* Returns whether the pair-list corresponding to nb_kernel_type is simple */ -bool nbnxn_kernel_pairlist_simple(int nb_kernel_type); +static inline bool kernelTypeUsesSimplePairlist(const KernelType kernelType) +{ + return (kernelType == KernelType::Cpu4x4_PlainC || + kernelType == KernelType::Cpu4xN_Simd_4xN || + kernelType == KernelType::Cpu4xN_Simd_2xNN); +} -/* Returns the nbnxn i-cluster size in atoms for the nbnxn kernel type */ -int nbnxn_kernel_to_cluster_i_size(int nb_kernel_type); +static inline bool kernelTypeIsSimd(const KernelType kernelType) +{ + return (kernelType == KernelType::Cpu4xN_Simd_4xN || + kernelType == KernelType::Cpu4xN_Simd_2xNN); +} -/* Returns the nbnxn i-cluster size in atoms for the nbnxn kernel type */ -int nbnxn_kernel_to_cluster_j_size(int nb_kernel_type); +} // namespace Nbnxm /* Returns the effective list radius of the pair-list * diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp index 44a3d632ca..eef7c9a3ed 100644 --- a/src/gromacs/nbnxm/nbnxm_setup.cpp +++ b/src/gromacs/nbnxm/nbnxm_setup.cpp @@ -54,6 +54,7 @@ #include "gromacs/nbnxm/nbnxm.h" #include "gromacs/nbnxm/nbnxm_geometry.h" #include "gromacs/nbnxm/nbnxm_simd.h" +#include "gromacs/nbnxm/pairlist.h" #include "gromacs/nbnxm/pairlist_tuning.h" #include "gromacs/nbnxm/pairlistset.h" #include "gromacs/simd/simd.h" @@ -87,21 +88,24 @@ static gmx_bool nbnxn_simd_supported(const gmx::MDLogger &mdlog, } /*! \brief Returns the most suitable CPU kernel type and Ewald handling */ -static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, - int *kernel_type, - int *ewald_excl, - const gmx_hw_info_t gmx_unused &hardwareInfo) +static KernelSetup +pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, + const gmx_hw_info_t gmx_unused &hardwareInfo) { - *kernel_type = nbnxnk4x4_PlainC; - *ewald_excl = ewaldexclTable; + KernelSetup kernelSetup; -#if GMX_SIMD + if (!GMX_SIMD) + { + kernelSetup.kernelType = KernelType::Cpu4x4_PlainC; + kernelSetup.ewaldExclusionType = EwaldExclusionType::Table; + } + else { #ifdef GMX_NBNXN_SIMD_4XN - *kernel_type = nbnxnk4xN_SIMD_4xN; + kernelSetup.kernelType = KernelType::Cpu4xN_Simd_4xN; #endif #ifdef GMX_NBNXN_SIMD_2XNN - *kernel_type = nbnxnk4xN_SIMD_2xNN; + kernelSetup.kernelType = KernelType::Cpu4xN_Simd_2xNN; #endif #if defined GMX_NBNXN_SIMD_2XNN && defined GMX_NBNXN_SIMD_4XN @@ -121,22 +125,21 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, * use of HT, use 4x8 to avoid a potential performance hit. * On Intel Haswell 4x8 is always faster. */ - *kernel_type = nbnxnk4xN_SIMD_4xN; + kernelSetup.kernelType = KernelType::Cpu4xN_Simd_4xN; -#if !GMX_SIMD_HAVE_FMA - if (EEL_PME_EWALD(ir->coulombtype) || - EVDW_PME(ir->vdwtype)) + if (!GMX_SIMD_HAVE_FMA && (EEL_PME_EWALD(ir->coulombtype) || + EVDW_PME(ir->vdwtype))) { /* We have Ewald kernels without FMA (Intel Sandy/Ivy Bridge). * There are enough instructions to make 2x(4+4) efficient. */ - *kernel_type = nbnxnk4xN_SIMD_2xNN; + kernelSetup.kernelType = KernelType::Cpu4xN_Simd_2xNN; } -#endif + if (hardwareInfo.haveAmdZenCpu) { /* One 256-bit FMA per cycle makes 2xNN faster */ - *kernel_type = nbnxnk4xN_SIMD_2xNN; + kernelSetup.kernelType = KernelType::Cpu4xN_Simd_2xNN; } #endif /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */ @@ -144,7 +147,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, if (getenv("GMX_NBNXN_SIMD_4XN") != nullptr) { #ifdef GMX_NBNXN_SIMD_4XN - *kernel_type = nbnxnk4xN_SIMD_4xN; + kernelSetup.kernelType = KernelType::Cpu4xN_Simd_4xN; #else gmx_fatal(FARGS, "SIMD 4xN kernels requested, but GROMACS has been compiled without support for these kernels"); #endif @@ -152,7 +155,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, if (getenv("GMX_NBNXN_SIMD_2XNN") != nullptr) { #ifdef GMX_NBNXN_SIMD_2XNN - *kernel_type = nbnxnk4xN_SIMD_2xNN; + kernelSetup.kernelType = KernelType::Cpu4xN_Simd_2xNN; #else gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but GROMACS has been compiled without support for these kernels"); #endif @@ -164,53 +167,58 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, * will probably always be faster for a SIMD width of 8 or more. * With FMA analytical is sometimes faster for a width if 4 as well. * In single precision, this is faster on Bulldozer. - */ -#if GMX_SIMD_REAL_WIDTH >= 8 || \ - (GMX_SIMD_REAL_WIDTH >= 4 && GMX_SIMD_HAVE_FMA && !GMX_DOUBLE) - /* On AMD Zen, tabulated Ewald kernels are faster on all 4 combinations + * On AMD Zen, tabulated Ewald kernels are faster on all 4 combinations * of single or double precision and 128 or 256-bit AVX2. */ - if (!hardwareInfo.haveAmdZenCpu) + if ( +#if GMX_SIMD + (GMX_SIMD_REAL_WIDTH >= 8 || + (GMX_SIMD_REAL_WIDTH >= 4 && GMX_SIMD_HAVE_FMA && !GMX_DOUBLE)) && +#endif + !hardwareInfo.haveAmdZenCpu) { - *ewald_excl = ewaldexclAnalytical; + kernelSetup.ewaldExclusionType = EwaldExclusionType::Analytical; + } + else + { + kernelSetup.ewaldExclusionType = EwaldExclusionType::Table; } -#endif if (getenv("GMX_NBNXN_EWALD_TABLE") != nullptr) { - *ewald_excl = ewaldexclTable; + kernelSetup.ewaldExclusionType = EwaldExclusionType::Table; } if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != nullptr) { - *ewald_excl = ewaldexclAnalytical; + kernelSetup.ewaldExclusionType = EwaldExclusionType::Analytical; } } -#endif // GMX_SIMD + + return kernelSetup; } -const char *lookup_kernel_name(int kernel_type) +const char *lookup_kernel_name(const KernelType kernelType) { const char *returnvalue = nullptr; - switch (kernel_type) + switch (kernelType) { - case nbnxnkNotSet: + case KernelType::NotSet: returnvalue = "not set"; break; - case nbnxnk4x4_PlainC: + case KernelType::Cpu4x4_PlainC: returnvalue = "plain C"; break; - case nbnxnk4xN_SIMD_4xN: - case nbnxnk4xN_SIMD_2xNN: + case KernelType::Cpu4xN_Simd_4xN: + case KernelType::Cpu4xN_Simd_2xNN: #if GMX_SIMD returnvalue = "SIMD"; #else // GMX_SIMD returnvalue = "not available"; #endif // GMX_SIMD break; - case nbnxnk8x8x8_GPU: returnvalue = "GPU"; break; - case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break; + case KernelType::Gpu8x8x8: returnvalue = "GPU"; break; + case KernelType::Cpu8x8x8_PlainC: returnvalue = "plain C"; break; - case nbnxnkNR: default: gmx_fatal(FARGS, "Illegal kernel type selected"); } @@ -218,45 +226,42 @@ const char *lookup_kernel_name(int kernel_type) }; /*! \brief Returns the most suitable kernel type and Ewald handling */ -static void pick_nbnxn_kernel(const gmx::MDLogger &mdlog, - gmx_bool use_simd_kernels, - const gmx_hw_info_t &hardwareInfo, - bool useGpu, - EmulateGpuNonbonded emulateGpu, - const t_inputrec *ir, - int *kernel_type, - int *ewald_excl, - gmx_bool bDoNonbonded) +static KernelSetup +pick_nbnxn_kernel(const gmx::MDLogger &mdlog, + gmx_bool use_simd_kernels, + const gmx_hw_info_t &hardwareInfo, + const NonbondedResource &nonbondedResource, + const t_inputrec *ir, + gmx_bool bDoNonbonded) { - GMX_RELEASE_ASSERT(kernel_type, "Need a valid kernel_type pointer"); + KernelSetup kernelSetup; - *kernel_type = nbnxnkNotSet; - *ewald_excl = ewaldexclTable; - - if (emulateGpu == EmulateGpuNonbonded::Yes) + if (nonbondedResource == NonbondedResource::EmulateGpu) { - *kernel_type = nbnxnk8x8x8_PlainC; + kernelSetup.kernelType = KernelType::Cpu8x8x8_PlainC; + kernelSetup.ewaldExclusionType = EwaldExclusionType::DecidedByGpuModule; if (bDoNonbonded) { GMX_LOG(mdlog.warning).asParagraph().appendText("Emulating a GPU run on the CPU (slow)"); } } - else if (useGpu) + else if (nonbondedResource == NonbondedResource::Gpu) { - *kernel_type = nbnxnk8x8x8_GPU; + kernelSetup.kernelType = KernelType::Gpu8x8x8; + kernelSetup.ewaldExclusionType = EwaldExclusionType::DecidedByGpuModule; } - - if (*kernel_type == nbnxnkNotSet) + else { if (use_simd_kernels && nbnxn_simd_supported(mdlog, ir)) { - pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl, hardwareInfo); + kernelSetup = pick_nbnxn_kernel_cpu(ir, hardwareInfo); } else { - *kernel_type = nbnxnk4x4_PlainC; + kernelSetup.kernelType = KernelType::Cpu4x4_PlainC; + kernelSetup.ewaldExclusionType = EwaldExclusionType::Analytical; } } @@ -264,21 +269,41 @@ static void pick_nbnxn_kernel(const gmx::MDLogger &mdlog, { GMX_LOG(mdlog.info).asParagraph().appendTextFormatted( "Using %s %dx%d nonbonded short-range kernels", - lookup_kernel_name(*kernel_type), - nbnxn_kernel_to_cluster_i_size(*kernel_type), - nbnxn_kernel_to_cluster_j_size(*kernel_type)); + lookup_kernel_name(kernelSetup.kernelType), + IClusterSizePerKernelType[kernelSetup.kernelType], + JClusterSizePerKernelType[kernelSetup.kernelType]); - if (nbnxnk4x4_PlainC == *kernel_type || - nbnxnk8x8x8_PlainC == *kernel_type) + if (KernelType::Cpu4x4_PlainC == kernelSetup.kernelType || + KernelType::Cpu8x8x8_PlainC == kernelSetup.kernelType) { GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( "WARNING: Using the slow %s kernels. This should\n" "not happen during routine usage on supported platforms.", - lookup_kernel_name(*kernel_type)); + lookup_kernel_name(kernelSetup.kernelType)); } } + + GMX_RELEASE_ASSERT(kernelSetup.kernelType != KernelType::NotSet && + kernelSetup.ewaldExclusionType != EwaldExclusionType::NotSet, + "All kernel setup parameters should be set here"); + + return kernelSetup; } +} // namespace Nbnxm + +void nonbonded_verlet_t::initPairlistSets(const bool haveMultipleDomains) +{ + pairlistSets_.emplace_back(*listParams); + if (haveMultipleDomains) + { + pairlistSets_.emplace_back(*listParams); + } +} + +namespace Nbnxm +{ + void init_nb_verlet(const gmx::MDLogger &mdlog, nonbonded_verlet_t **nb_verlet, gmx_bool bFEP_NonBonded, @@ -290,38 +315,42 @@ void init_nb_verlet(const gmx::MDLogger &mdlog, const gmx_mtop_t *mtop, matrix box) { - nonbonded_verlet_t *nbv = new nonbonded_verlet_t(); + nonbonded_verlet_t *nbv = new nonbonded_verlet_t(); - const EmulateGpuNonbonded emulateGpu = - ((getenv("GMX_EMULATE_GPU") != nullptr) ? EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No); - bool useGpu = deviceInfo != nullptr; + const bool emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr); + const bool useGpu = deviceInfo != nullptr; - GMX_RELEASE_ASSERT(!(emulateGpu == EmulateGpuNonbonded::Yes && useGpu), "When GPU emulation is active, there cannot be a GPU assignment"); + GMX_RELEASE_ASSERT(!(emulateGpu && useGpu), "When GPU emulation is active, there cannot be a GPU assignment"); + + NonbondedResource nonbondedResource; + if (useGpu) + { + nonbondedResource = NonbondedResource::Gpu; + } + else if (emulateGpu) + { + nonbondedResource = NonbondedResource::EmulateGpu; + } + else + { + nonbondedResource = NonbondedResource::Cpu; + } nbv->nbs = nullptr; - pick_nbnxn_kernel(mdlog, fr->use_simd_kernels, hardwareInfo, - useGpu, emulateGpu, ir, - &nbv->kernelType_, - &nbv->ewaldExclusionType_, - fr->bNonbonded); + nbv->setKernelSetup(pick_nbnxn_kernel(mdlog, fr->use_simd_kernels, hardwareInfo, + nonbondedResource, ir, + fr->bNonbonded)); const bool haveMultipleDomains = (DOMAINDECOMP(cr) && cr->dd->nnodes > 1); - const bool pairlistIsSimple = nbv->pairlistIsSimple(); - for (nbnxn_pairlist_set_t &pairlistSet : nbv->pairlistSets) - { - // TODO Change this to a constructor - /* The second parameter tells whether lists should be combined, - * this is currently only and always done for GPU lists. - */ - nbnxn_init_pairlist_set(&pairlistSet, pairlistIsSimple, !pairlistIsSimple); - } + nbv->listParams = std::make_unique(nbv->kernelSetup().kernelType, + ir->rlist); + nbv->initPairlistSets(haveMultipleDomains); nbv->min_ci_balanced = 0; - nbv->listParams = std::make_unique(ir->rlist); - setupDynamicPairlistPruning(mdlog, ir, mtop, box, nbv->kernelType_, fr->ic, + setupDynamicPairlistPruning(mdlog, ir, mtop, box, fr->ic, nbv->listParams.get()); nbv->nbs = std::make_unique(ir->ePBC, @@ -369,11 +398,11 @@ void init_nb_verlet(const gmx::MDLogger &mdlog, } nbnxn_atomdata_init(mdlog, nbv->nbat, - nbv->kernelType_, + nbv->kernelSetup().kernelType, enbnxninitcombrule, fr->ntype, fr->nbfp, mimimumNumEnergyGroupNonbonded, - pairlistIsSimple ? gmx_omp_nthreads_get(emntNonbonded) : 1); + nbv->pairlistIsSimple() ? gmx_omp_nthreads_get(emntNonbonded) : 1); if (useGpu) { diff --git a/src/gromacs/nbnxm/pairlist.cpp b/src/gromacs/nbnxm/pairlist.cpp index 9a4b667756..06293a0261 100644 --- a/src/gromacs/nbnxm/pairlist.cpp +++ b/src/gromacs/nbnxm/pairlist.cpp @@ -804,13 +804,14 @@ NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) : work = new NbnxnPairlistGpuWork(); } -void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, - gmx_bool bSimple, gmx_bool bCombined) +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list) { - GMX_RELEASE_ASSERT(!bSimple || !bCombined, "Can only combine non-simple lists"); - - nbl_list->bSimple = bSimple; - nbl_list->bCombined = bCombined; + nbl_list->bSimple = + (nbl_list->params.pairlistType == PairlistType::Simple4x2 || + nbl_list->params.pairlistType == PairlistType::Simple4x4 || + nbl_list->params.pairlistType == PairlistType::Simple4x8); + // Currently GPU lists are always combined + nbl_list->bCombined = !nbl_list->bSimple; nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded); @@ -821,7 +822,7 @@ void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS); } - if (bSimple) + if (nbl_list->bSimple) { snew(nbl_list->nbl, nbl_list->nnbl); if (nbl_list->nnbl > 1) @@ -833,7 +834,7 @@ void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, { snew(nbl_list->nblGpu, nbl_list->nnbl); } - snew(nbl_list->nbl_fep, nbl_list->nnbl); + nbl_list->nbl_fep.resize(nbl_list->nnbl); /* Execute in order to avoid memory interleaving between threads */ #pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static) for (int i = 0; i < nbl_list->nnbl; i++) @@ -843,7 +844,7 @@ void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, /* Allocate the nblist data structure locally on each thread * to optimize memory access for NUMA architectures. */ - if (bSimple) + if (nbl_list->bSimple) { nbl_list->nbl[i] = new NbnxnPairlistCpu(); @@ -2455,24 +2456,24 @@ static void icell_set_x_simple(int ci, static void icell_set_x(int ci, real shx, real shy, real shz, int stride, const real *x, - int nb_kernel_type, + const Nbnxm::KernelType kernelType, NbnxnPairlistCpuWork *work) { - switch (nb_kernel_type) + switch (kernelType) { #if GMX_SIMD #ifdef GMX_NBNXN_SIMD_4XN - case nbnxnk4xN_SIMD_4xN: + case Nbnxm::KernelType::Cpu4xN_Simd_4xN: icell_set_x_simd_4xn(ci, shx, shy, shz, stride, x, work); break; #endif #ifdef GMX_NBNXN_SIMD_2XNN - case nbnxnk4xN_SIMD_2xNN: + case Nbnxm::KernelType::Cpu4xN_Simd_2xNN: icell_set_x_simd_2xnn(ci, shx, shy, shz, stride, x, work); break; #endif #endif - case nbnxnk4x4_PlainC: + case Nbnxm::KernelType::Cpu4x4_PlainC: icell_set_x_simple(ci, shx, shy, shz, stride, x, &work->iClusterData); break; default: @@ -2485,7 +2486,7 @@ static void icell_set_x(int ci, static void icell_set_x(int ci, real shx, real shy, real shz, int stride, const real *x, - int gmx_unused nb_kernel_type, + Nbnxm::KernelType gmx_unused kernelType, NbnxnPairlistGpuWork *work) { #if !GMX_SIMD4_HAVE_REAL @@ -3116,12 +3117,12 @@ static void makeClusterListWrapper(NbnxnPairlistCpu *nbl, const nbnxn_atomdata_t *nbat, const real rlist2, const real rbb2, - const int nb_kernel_type, + const Nbnxm::KernelType kernelType, int *numDistanceChecks) { - switch (nb_kernel_type) + switch (kernelType) { - case nbnxnk4x4_PlainC: + case Nbnxm::KernelType::Cpu4x4_PlainC: makeClusterListSimple(jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal, @@ -3130,7 +3131,7 @@ static void makeClusterListWrapper(NbnxnPairlistCpu *nbl, numDistanceChecks); break; #ifdef GMX_NBNXN_SIMD_4XN - case nbnxnk4xN_SIMD_4xN: + case Nbnxm::KernelType::Cpu4xN_Simd_4xN: makeClusterListSimd4xn(jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal, @@ -3140,7 +3141,7 @@ static void makeClusterListWrapper(NbnxnPairlistCpu *nbl, break; #endif #ifdef GMX_NBNXN_SIMD_2XNN - case nbnxnk4xN_SIMD_2xNN: + case Nbnxm::KernelType::Cpu4xN_Simd_2xNN: makeClusterListSimd2xnn(jGrid, nbl, ci, firstCell, lastCell, excludeSubDiagonal, @@ -3149,6 +3150,8 @@ static void makeClusterListWrapper(NbnxnPairlistCpu *nbl, numDistanceChecks); break; #endif + default: + GMX_ASSERT(false, "Unhandled kernel type"); } } @@ -3162,7 +3165,7 @@ static void makeClusterListWrapper(NbnxnPairlistGpu *nbl, const nbnxn_atomdata_t *nbat, const real rlist2, const real rbb2, - const int gmx_unused nb_kernel_type, + Nbnxm::KernelType gmx_unused kernelType, int *numDistanceChecks) { for (int cj = firstCell; cj <= lastCell; cj++) @@ -3248,7 +3251,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search *nbs, const nbnxn_atomdata_t *nbat, const t_blocka &exclusions, real rlist, - int nb_kernel_type, + const Nbnxm::KernelType kernelType, int ci_block, gmx_bool bFBufferFlag, int nsubpair_max, @@ -3283,7 +3286,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search *nbs, sync_work(nbl); GMX_ASSERT(nbl->na_ci == jGrid.na_c, "The cluster sizes in the list and grid should match"); - nbl->na_cj = nbnxn_kernel_to_cluster_j_size(nb_kernel_type); + nbl->na_cj = Nbnxm::JClusterSizePerKernelType[kernelType]; na_cj_2log = get_2log(nbl->na_cj); nbl->rlist = rlist; @@ -3542,7 +3545,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search *nbs, icell_set_x(cell0_i+ci, shx, shy, shz, nbat->xstride, nbat->x().data(), - nb_kernel_type, + kernelType, nbl->work); for (int cx = cxf; cx <= cxl; cx++) @@ -3687,7 +3690,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search *nbs, excludeSubDiagonal, nbat, rlist2, rbb2, - nb_kernel_type, + kernelType, &numDistanceChecks); if (bFBufferFlag) @@ -4034,6 +4037,7 @@ static void sort_sci(NbnxnPairlistGpu *nbl) void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, const InteractionLocality iLocality, + nbnxn_pairlist_set_t *nbl_list, const t_blocka *excl, const int64_t step, t_nrnb *nrnb) @@ -4041,7 +4045,6 @@ void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, nbnxn_search *nbs = nbv->nbs.get(); nbnxn_atomdata_t *nbat = nbv->nbat; const real rlist = nbv->listParams->rlistOuter; - nbnxn_pairlist_set_t *nbl_list = &nbv->pairlistSets[iLocality]; int nsubpair_target; float nsubpair_tot_est; @@ -4170,7 +4173,7 @@ void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, nbnxn_make_pairlist_part(nbs, iGrid, jGrid, &nbs->work[th], nbat, *excl, rlist, - nbv->kernelType_, + nbv->kernelSetup().kernelType, ci_block, nbat->bUseBufferFlags, nsubpair_target, @@ -4184,7 +4187,7 @@ void nbnxn_make_pairlist(nonbonded_verlet_t *nbv, nbnxn_make_pairlist_part(nbs, iGrid, jGrid, &nbs->work[th], nbat, *excl, rlist, - nbv->kernelType_, + nbv->kernelSetup().kernelType, ci_block, nbat->bUseBufferFlags, nsubpair_target, diff --git a/src/gromacs/nbnxm/pairlist.h b/src/gromacs/nbnxm/pairlist.h index 74bf78fc8f..71d90b3883 100644 --- a/src/gromacs/nbnxm/pairlist.h +++ b/src/gromacs/nbnxm/pairlist.h @@ -46,6 +46,7 @@ #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/bitmask.h" #include "gromacs/utility/defaultinitializationallocator.h" +#include "gromacs/utility/enumerationhelpers.h" #include "gromacs/utility/real.h" // This file with constants is separate from this file to be able @@ -56,6 +57,11 @@ struct NbnxnPairlistCpuWork; struct NbnxnPairlistGpuWork; struct tMPI_Atomic; +namespace Nbnxm +{ +enum class KernelType; +} + /* Convenience type for vector with aligned memory */ template using AlignedVector = std::vector < T, gmx::AlignedAllocator < T>>; @@ -64,6 +70,18 @@ using AlignedVector = std::vector < T, gmx::AlignedAllocator < T>>; template using FastVector = std::vector < T, gmx::DefaultInitializationAllocator < T>>; +enum class PairlistType : int +{ + Simple4x2, + Simple4x4, + Simple4x8, + Hierarchical8x8, + Count +}; + +static constexpr gmx::EnumerationArray IClusterSizePerListType = { 4, 4, 4, 8 }; +static constexpr gmx::EnumerationArray JClusterSizePerListType = { 2, 4, 8, 8 }; + /*! \cond INTERNAL */ /*! \brief The setup for generating and pruning the nbnxn pair list. @@ -74,20 +92,15 @@ struct NbnxnListParameters { /*! \brief Constructor producing a struct with dynamic pruning disabled */ - NbnxnListParameters(real rlist) : - useDynamicPruning(false), - nstlistPrune(-1), - rlistOuter(rlist), - rlistInner(rlist), - numRollingParts(1) - { - } - - bool useDynamicPruning; //!< Are we using dynamic pair-list pruning - int nstlistPrune; //!< Pair-list dynamic pruning interval - real rlistOuter; //!< Cut-off of the larger, outer pair-list - real rlistInner; //!< Cut-off of the smaller, inner pair-list - int numRollingParts; //!< The number parts to divide the pair-list into for rolling pruning, a value of 1 gives no rolling pruning + NbnxnListParameters(Nbnxm::KernelType kernelType, + real rlist); + + PairlistType pairlistType; //!< The type of cluster-pair list + bool useDynamicPruning; //!< Are we using dynamic pair-list pruning + int nstlistPrune; //!< Pair-list dynamic pruning interval + real rlistOuter; //!< Cut-off of the larger, outer pair-list + real rlistInner; //!< Cut-off of the smaller, inner pair-list + int numRollingParts; //!< The number parts to divide the pair-list into for rolling pruning, a value of 1 gives no rolling pruning }; /*! \endcond */ @@ -292,18 +305,23 @@ struct NbnxnPairlistGpu struct nbnxn_pairlist_set_t { - int nnbl; /* number of lists */ - NbnxnPairlistCpu **nbl; /* lists for CPU */ - NbnxnPairlistCpu **nbl_work; /* work space for rebalancing lists */ - NbnxnPairlistGpu **nblGpu; /* lists for GPU */ - gmx_bool bCombined; /* TRUE if lists get combined into one (the 1st) */ - gmx_bool bSimple; /* TRUE if the list of of type "simple" + nbnxn_pairlist_set_t(const NbnxnListParameters &listParams); + + int nnbl; /* number of lists */ + NbnxnPairlistCpu **nbl; /* lists for CPU */ + NbnxnPairlistCpu **nbl_work; /* work space for rebalancing lists */ + NbnxnPairlistGpu **nblGpu; /* lists for GPU */ + const NbnxnListParameters ¶ms; /* Pairlist parameters desribing setup and ranges */ + gmx_bool bCombined; /* TRUE if lists get combined into one (the 1st) */ + gmx_bool bSimple; /* TRUE if the list of of type "simple" (na_sc=na_s, no super-clusters used) */ - int natpair_ljq; /* Total number of atom pairs for LJ+Q kernel */ - int natpair_lj; /* Total number of atom pairs for LJ kernel */ - int natpair_q; /* Total number of atom pairs for Q kernel */ - t_nblist **nbl_fep; /* List of free-energy atom pair interactions */ - int64_t outerListCreationStep; /* Step at which the outer list was created */ + + /* Counts for debug printing */ + int natpair_ljq; /* Total number of atom pairs for LJ+Q kernel */ + int natpair_lj; /* Total number of atom pairs for LJ kernel */ + int natpair_q; /* Total number of atom pairs for Q kernel */ + std::vector nbl_fep; /* List of free-energy atom pair interactions */ + int64_t outerListCreationStep; /* Step at which the outer list was created */ }; enum { @@ -315,12 +333,12 @@ struct nbnxn_atomdata_output_t { /* Constructor * - * \param[in] nb_kernel_type Type of non-bonded kernel + * \param[in] kernelType Type of non-bonded kernel * \param[in] numEnergyGroups The number of energy groups * \param[in] simdEnergyBufferStride Stride for entries in the energy buffers for SIMD kernels * \param[in] pinningPolicy Sets the pinning policy for all buffers used on the GPU */ - nbnxn_atomdata_output_t(int nb_kernel_type, + nbnxn_atomdata_output_t(Nbnxm::KernelType kernelType, int numEnergyGroups, int simdEnergyBUfferStride, gmx::PinningPolicy pinningPolicy); diff --git a/src/gromacs/nbnxm/pairlist_tuning.cpp b/src/gromacs/nbnxm/pairlist_tuning.cpp index a8924efefc..d3ee3bc2b3 100644 --- a/src/gromacs/nbnxm/pairlist_tuning.cpp +++ b/src/gromacs/nbnxm/pairlist_tuning.cpp @@ -483,7 +483,6 @@ void setupDynamicPairlistPruning(const gmx::MDLogger &mdlog, const t_inputrec *ir, const gmx_mtop_t *mtop, matrix box, - int nbnxnKernelType, const interaction_const_t *ic, NbnxnListParameters *listParams) { @@ -492,10 +491,14 @@ void setupDynamicPairlistPruning(const gmx::MDLogger &mdlog, /* Initialize the parameters to no dynamic list pruning */ listParams->useDynamicPruning = false; - const VerletbufListSetup ls = verletbufGetListSetup(nbnxnKernelType); + const VerletbufListSetup ls = + { + IClusterSizePerListType[listParams->pairlistType], + JClusterSizePerListType[listParams->pairlistType] + }; /* Currently emulation mode does not support dual pair-lists */ - const bool useGpu = (nbnxnKernelType == nbnxnk8x8x8_GPU); + const bool useGpu = (listParams->pairlistType == PairlistType::Hierarchical8x8); if (supportsDynamicPairlistGenerationInterval(*ir) && getenv("GMX_DISABLE_DYNAMICPRUNING") == nullptr) diff --git a/src/gromacs/nbnxm/pairlist_tuning.h b/src/gromacs/nbnxm/pairlist_tuning.h index a6f82fcddf..3c2227dea9 100644 --- a/src/gromacs/nbnxm/pairlist_tuning.h +++ b/src/gromacs/nbnxm/pairlist_tuning.h @@ -86,7 +86,6 @@ void increaseNstlist(FILE *fplog, t_commrec *cr, * \param[in] ir The input parameter record * \param[in] mtop The global topology * \param[in] box The unit cell - * \param[in] nbnxnKernelType The type of nbnxn kernel used * \param[in] ic The nonbonded interactions constants * \param[in,out] listParams The list setup parameters */ @@ -94,7 +93,6 @@ void setupDynamicPairlistPruning(const gmx::MDLogger &mdlog, const t_inputrec *ir, const gmx_mtop_t *mtop, matrix box, - int nbnxnKernelType, const interaction_const_t *ic, NbnxnListParameters *listParams); diff --git a/src/gromacs/nbnxm/pairlistset.cpp b/src/gromacs/nbnxm/pairlistset.cpp index 8454b4921c..18b0b12a5c 100644 --- a/src/gromacs/nbnxm/pairlistset.cpp +++ b/src/gromacs/nbnxm/pairlistset.cpp @@ -33,16 +33,68 @@ * the research papers on the package. Check out http://www.gromacs.org. */ +/*! \internal \file + * \brief + * Implements functionality for nbnxn_pairlist_set_t. + * + * \author Berk Hess + * \ingroup module_nbnxm + */ + #include "gmxpre.h" +#include "pairlistset.h" + #include "gromacs/nbnxm/nbnxm.h" +#include "gromacs/nbnxm/nbnxm_geometry.h" #include "gromacs/nbnxm/pairlist.h" +#include "gromacs/utility/gmxassert.h" + +/*! \cond INTERNAL */ + +NbnxnListParameters::NbnxnListParameters(const Nbnxm::KernelType kernelType, + const real rlist) : + useDynamicPruning(false), + nstlistPrune(-1), + rlistOuter(rlist), + rlistInner(rlist), + numRollingParts(1) +{ + if (!Nbnxm::kernelTypeUsesSimplePairlist(kernelType)) + { + pairlistType = PairlistType::Hierarchical8x8; + } + else + { + switch (Nbnxm::JClusterSizePerKernelType[kernelType]) + { + case 2: + pairlistType = PairlistType::Simple4x2; + break; + case 4: + pairlistType = PairlistType::Simple4x4; + break; + case 8: + pairlistType = PairlistType::Simple4x8; + break; + default: + GMX_RELEASE_ASSERT(false, "Kernel type does not have a pairlist type"); + } + } +} + +nbnxn_pairlist_set_t::nbnxn_pairlist_set_t(const NbnxnListParameters &listParams) : + params(listParams) +{ + // TODO move this into this constructor + nbnxn_init_pairlist_set(this); +} int nbnxnNumStepsWithPairlist(const nonbonded_verlet_t &nbv, const Nbnxm::InteractionLocality iLocality, const int64_t step) { - return step - nbv.pairlistSets[iLocality].outerListCreationStep; + return step - nbv.pairlistSet(iLocality).outerListCreationStep; } bool nbnxnIsDynamicPairlistPruningStep(const nonbonded_verlet_t &nbv, @@ -51,3 +103,5 @@ bool nbnxnIsDynamicPairlistPruningStep(const nonbonded_verlet_t &nbv, { return nbnxnNumStepsWithPairlist(nbv, iLocality, step) % nbv.listParams->nstlistPrune == 0; } + +/*! \endcond */ diff --git a/src/gromacs/nbnxm/pairlistset.h b/src/gromacs/nbnxm/pairlistset.h index b366b79730..e2da79e853 100644 --- a/src/gromacs/nbnxm/pairlistset.h +++ b/src/gromacs/nbnxm/pairlistset.h @@ -68,9 +68,11 @@ nbnxn_search *nbnxn_init_search(int ePBC, gmx_bool bFEP, int nthread_max); -/* Initializes a set of pair lists stored in nbnxn_pairlist_set_t */ -void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, - gmx_bool simple, gmx_bool combined); +/* Initializes a set of pair lists stored in nbnxn_pairlist_set_t + * + * TODO: Merge into the constructor + */ +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list); /*! \brief Prepare the list-set produced by the search for dynamic pruning * diff --git a/src/gromacs/nbnxm/prunekerneldispatch.cpp b/src/gromacs/nbnxm/prunekerneldispatch.cpp index 38d62fad5c..8caddc494e 100644 --- a/src/gromacs/nbnxm/prunekerneldispatch.cpp +++ b/src/gromacs/nbnxm/prunekerneldispatch.cpp @@ -45,13 +45,12 @@ #include "kernels_simd_4xm/kernel_prune.h" -void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv, - const Nbnxm::InteractionLocality ilocality, - const rvec *shift_vec) +void NbnxnDispatchPruneKernel(nbnxn_pairlist_set_t *nbl_lists, + const Nbnxm::KernelType kernelType, + const nbnxn_atomdata_t *nbat, + const rvec *shift_vec) { - nbnxn_pairlist_set_t *nbl_lists = &nbv->pairlistSets[ilocality]; - const nbnxn_atomdata_t *nbat = nbv->nbat; - const real rlistInner = nbv->listParams->rlistInner; + const real rlistInner = nbl_lists->params.rlistInner; GMX_ASSERT(nbl_lists->nbl[0]->ciOuter.size() >= nbl_lists->nbl[0]->ci.size(), "Here we should either have an empty ci list or ciOuter should be >= ci"); @@ -62,15 +61,15 @@ void NbnxnDispatchPruneKernel(nonbonded_verlet_t *nbv, { NbnxnPairlistCpu *nbl = nbl_lists->nbl[i]; - switch (nbv->kernelType_) + switch (kernelType) { - case nbnxnk4xN_SIMD_4xN: + case Nbnxm::KernelType::Cpu4xN_Simd_4xN: nbnxn_kernel_prune_4xn(nbl, nbat, shift_vec, rlistInner); break; - case nbnxnk4xN_SIMD_2xNN: + case Nbnxm::KernelType::Cpu4xN_Simd_2xNN: nbnxn_kernel_prune_2xnn(nbl, nbat, shift_vec, rlistInner); break; - case nbnxnk4x4_PlainC: + case Nbnxm::KernelType::Cpu4x4_PlainC: nbnxn_kernel_prune_ref(nbl, nbat, shift_vec, rlistInner); break; default: diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp index 09f4ebc27c..fbb3683047 100644 --- a/src/gromacs/taskassignment/decidegpuusage.cpp +++ b/src/gromacs/taskassignment/decidegpuusage.cpp @@ -60,7 +60,6 @@ #include "gromacs/mdtypes/commrec.h" #include "gromacs/mdtypes/inputrec.h" #include "gromacs/mdtypes/md_enums.h" -#include "gromacs/nbnxm/nbnxm.h" #include "gromacs/taskassignment/taskassignment.h" #include "gromacs/topology/topology.h" #include "gromacs/utility/baseversion.h" diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h index 02dcac5512..df0a50d231 100644 --- a/src/gromacs/taskassignment/decidegpuusage.h +++ b/src/gromacs/taskassignment/decidegpuusage.h @@ -49,8 +49,6 @@ struct gmx_hw_info_t; struct gmx_mtop_t; struct t_inputrec; -enum class EmulateGpuNonbonded : bool; - namespace gmx { @@ -62,6 +60,15 @@ enum class TaskTarget : int Gpu }; +//! Help pass GPU-emulation parameters with type safety. +enum class EmulateGpuNonbonded : bool +{ + //! Do not emulate GPUs. + No, + //! Do emulate GPUs. + Yes +}; + /*! \brief Decide whether this thread-MPI simulation will run * nonbonded tasks on GPUs. * -- 2.22.0