2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Define common implementation of nbnxm_gpu_data_mgmt.h
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Teemu Virolainen <teemu@streamcomputing.eu>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \author Artem Zhmurov <zhmurov@gmail.com>
45 * \ingroup module_nbnxm
52 # include "cuda/nbnxm_cuda_types.h"
56 # include "opencl/nbnxm_ocl_types.h"
59 #include "nbnxm_gpu_data_mgmt.h"
61 #include "gromacs/hardware/device_information.h"
62 #include "gromacs/mdtypes/interaction_const.h"
63 #include "gromacs/nbnxm/gpu_data_mgmt.h"
64 #include "gromacs/timing/gpu_timing.h"
65 #include "gromacs/utility/cstringutil.h"
67 #include "nbnxm_gpu.h"
68 #include "pairlistsets.h"
// Replaces the Ewald Coulomb force lookup table held in nbp with the data in tables.
// Reads tables.scale and tables.tableF; writes nbp->coulomb_tab_scale and passes
// nbp->coulomb_tab and nbp->coulomb_tab_texobj to the destroy call and to the final call.
// deviceContext is forwarded to the final call.
// NOTE(review): nbp is used below but is not declared in the visible signature, and the
// embedded numbering skips 74, 77 and 83. A parameter line, a possible guard around the
// destroy call and the name of the callee that takes the last argument list appear to be
// missing from this listing - confirm against the upstream file.
73 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
75 const DeviceContext& deviceContext)
// Release the table (and its companion texobj handle) currently stored in nbp.
79 destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
// Remember the scale that belongs to the new table.
82 nbp->coulomb_tab_scale = tables.scale;
// Argument list of a call whose name is not visible here; it receives the table slots in nbp,
// the force table data with its size, and the device context.
84 &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
// Chooses which Ewald electrostatics kernel flavor (ElecType) is used.
// ic: supplies rcoulomb and rvdw; when they differ the twin cut-off flavors are selected.
// deviceInfo: prop.major and prop.minor are read when computing the default.
// Returns one of ElecType::EwaldAna, EwaldTab, EwaldAnaTwin or EwaldTabTwin.
// Three environment variables can override the automatic choice (see below).
// NOTE(review): the embedded numbering has gaps (89, 91, 97, 99-100, 103-104, 106, 108,
// 111-113 and more), so braces, the callee that receives the message on 101-102 and probably
// some preprocessor lines are not visible in this listing - confirm against the upstream file.
87 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
88 const DeviceInformation gmx_unused& deviceInfo)
// Twin cut-off: the Coulomb and VdW cut-off radii are not equal.
90 bool bTwinCut = (ic.rcoulomb != ic.rvdw);
92 /* Benchmarking/development environment variables to force the use of
93 analytical or tabulated Ewald kernel. */
// Each override is enabled by the mere presence of its variable; the value is never inspected.
94 const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
95 const bool forceTabulatedEwald = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
96 const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
// Asking for both analytical and tabulated kernels is contradictory. The message below is
// presumably handed to an error routine that is not visible in this listing.
98 if (forceAnalyticalEwald && forceTabulatedEwald)
101 "Both analytical and tabulated Ewald GPU non-bonded kernels "
102 "requested through environment variables.");
// NOTE(review): no closing marker is visible for the block comment opened on the next line.
105 /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
107 const bool c_useTabulatedEwaldDefault =
109 (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
110 || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
// Start from the device-dependent default, then let the environment overrides win.
// The analytical override is tested first, so it is the one applied if it is set.
114 bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
115 if (forceAnalyticalEwald)
117 bUseAnalyticalEwald = true;
120 fprintf(debug, "Using analytical Ewald GPU kernels\n");
123 else if (forceTabulatedEwald)
125 bUseAnalyticalEwald = false;
129 fprintf(debug, "Using tabulated Ewald GPU kernels\n");
133 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
134 forces it (use it for debugging/benchmarking only). */
// Single cut-off flavors are returned only when neither the input nor the override asks for twin cut-off.
135 if (!bTwinCut && !forceTwinCutoffEwald)
137 return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
141 return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
// Copies cut-off and interaction parameters into the GPU non-bonded parameter struct.
// nbp: destination; every visible statement writes one of its fields.
// ic: source of the electrostatics and VdW constants.
// listParams: source of the pair-list radii and the dynamic-pruning flag.
// Most fields are plain copies; the exceptions are noted below.
145 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
147 nbp->ewald_beta = ic->ewaldcoeff_q;
148 nbp->sh_ewald = ic->sh_ewald;
149 nbp->epsfac = ic->epsfac;
// Stored premultiplied by two, as the field name says.
150 nbp->two_k_rf = 2.0 * ic->k_rf;
151 nbp->c_rf = ic->c_rf;
// Cut-off and list radii are stored squared.
152 nbp->rvdw_sq = ic->rvdw * ic->rvdw;
153 nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
154 nbp->rlistOuter_sq = listParams.rlistOuter * listParams.rlistOuter;
155 nbp->rlistInner_sq = listParams.rlistInner * listParams.rlistInner;
156 nbp->useDynamicPruning = listParams.useDynamicPruning;
// LJ Ewald parameters.
158 nbp->sh_lj_ewald = ic->sh_lj_ewald;
159 nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
// VdW switch and shift parameters.
161 nbp->rvdw_switch = ic->rvdw_switch;
162 nbp->dispersion_shift = ic->dispersion_shift;
163 nbp->repulsion_shift = ic->repulsion_shift;
164 nbp->vdw_switch = ic->vdw_switch;
// Refreshes the GPU non-bonded parameters from ic: cut-off parameters, the Ewald kernel
// flavor and the Coulomb Ewald force table.
// nbv: non-bonded object; its gpu_nbv member and pair-list parameters are read.
// ic: interaction constants; ic->coulombEwaldTables must be set (asserted below).
// NOTE(review): the statement controlled by the first condition is not visible in this
// listing (embedded lines 170-171 are missing); presumably an early return - confirm upstream.
167 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
// Guard for a null nbv or one that does not use a GPU.
169 if (!nbv || !nbv->useGpu())
173 NbnxmGpu* nb = nbv->gpu_nbv;
174 NBParamGpu* nbp = nb->nbparam;
176 set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
// The kernel flavor depends on the cut-offs in ic, so it is picked again here.
178 nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());
// The tables are dereferenced on the next line, hence the assertion.
180 GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
181 init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
// Puts a gpu_plist into its initial state.
// Visible assignments: imask_nalloc and excl_nalloc become -1, haveFreshList becomes false
// and both rolling-pruning counters become 0.
// NOTE(review): embedded lines 188-192, 194-199 and 201 are not visible in this listing.
// Going by the two comments below they presumably hold the nullptr assignments and the
// remaining -1 assignments - confirm against the upstream file.
184 void init_plist(gpu_plist* pl)
186 /* initialize to nullptr pointers to data that is not allocated here and will
187 need reallocation in nbnxn_gpu_init_pairlist */
193 /* size -1 indicates that the respective array hasn't been initialized yet */
200 pl->imask_nalloc = -1;
202 pl->excl_nalloc = -1;
// No fresh list yet; rolling pruning starts with zero parts, at part zero.
203 pl->haveFreshList = false;
204 pl->rollingPruningNumParts = 0;
205 pl->rollingPruningPart = 0;
// Zeroes the timing accumulators in t.
// Visible resets: every ktime[i][j] entry (time t and count c) of the 2x2 array,
// pruneTime.t, dynamicPruneTime.c and dynamicPruneTime.t.
// NOTE(review): embedded lines 210-214 and 223 are not visible in this listing. A reset of
// pruneTime.c would be expected next to pruneTime.t and may sit on the missing line 223 -
// confirm against the upstream file.
208 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
// Walk both dimensions of the 2x2 kernel timing array.
215 for (int i = 0; i < 2; i++)
217 for (int j = 0; j < 2; j++)
219 t->ktime[i][j].t = 0.0;
220 t->ktime[i][j].c = 0;
224 t->pruneTime.t = 0.0;
225 t->dynamicPruneTime.c = 0;
226 t->dynamicPruneTime.t = 0.0;
229 //! This function is documented in the header file
// Summary: sizes the sci, cj4, imask and excl buffers of nb->plist[iloc] to match h_plist,
// issues asynchronous copies for sci, cj4 and excl, and flags the list as fresh.
// nb: GPU non-bonded object; its streams, pair lists, timers and device context are used.
// h_plist: source pair list (presumably host-side, going by the h_ prefix).
// iloc: selects which stream, pair list and timer set of nb is used.
// NOTE(review): the embedded numbering has many gaps, so braces, several call arguments,
// the callee that receives the message on line 249 and the guards around the timing calls
// are not visible in this listing - confirm against the upstream file.
230 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
233 // Timing accumulation should happen only if there was work to do
234 // because getLastRangeTime() gets skipped with empty lists later
235 // which leads to the counter not being reset.
236 bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
237 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
238 gpu_plist* d_plist = nb->plist[iloc];
// A negative na_c is treated as not yet set, so it is taken from the source list.
240 if (d_plist->na_c < 0)
242 d_plist->na_c = h_plist->na_ci;
// Otherwise a differing cluster size is reported with the message below.
246 if (d_plist->na_c != h_plist->na_ci)
249 "In init_plist: the #atoms per cell has changed (from %d to %d)",
256 gpu_timers_t::Interaction& iTimers = nb->timers->interaction[iloc];
// Timing region for the pair-list transfer; presumably only entered when bDoTime is set.
260 iTimers.pl_h2d.openTimingRegion(deviceStream);
261 iTimers.didPairlistH2D = true;
264 // TODO most of this function is same in CUDA and OpenCL, move into the header
265 const DeviceContext& deviceContext = *nb->deviceContext_;
// sci: resize the buffer, then copy asynchronously; an event is fetched only when timing.
267 reallocateDeviceBuffer(
268 &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
269 copyToDeviceBuffer(&d_plist->sci,
274 GpuApiCallBehavior::Async,
275 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
// cj4: same resize-then-copy sequence.
277 reallocateDeviceBuffer(
278 &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
279 copyToDeviceBuffer(&d_plist->cj4,
284 GpuApiCallBehavior::Async,
285 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
// imask: only resized here (cj4 count times c_nbnxnGpuClusterpairSplit); no copy is visible.
287 reallocateDeviceBuffer(&d_plist->imask,
288 h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
290 &d_plist->imask_nalloc,
// excl: same resize-then-copy sequence.
293 reallocateDeviceBuffer(
294 &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
295 copyToDeviceBuffer(&d_plist->excl,
296 h_plist->excl.data(),
298 h_plist->excl.size(),
300 GpuApiCallBehavior::Async,
301 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
// Ends the timing region opened above.
305 iTimers.pl_h2d.closeTimingRegion(deviceStream);
308 /* need to prune the pair list during the next step */
309 d_plist->haveFreshList = true;
312 //! This function is documented in the header file
313 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
315 return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
318 //! This function is documented in the header file
319 void gpu_reset_timings(nonbonded_verlet_t* nbv)
321 if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
323 init_timings(nbv->gpu_nbv->timings);
327 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
329 return ((nb->nbparam->elecType == ElecType::EwaldAna)
330 || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
// Maps the electrostatics type in ic->eeltype to the ElecType used by the GPU kernels.
//   - eelCUT gives ElecType::Cut.
//   - reaction-field types (EEL_RF) have their own branch; its return value is not visible here.
//   - PME types and eelEWALD are delegated to nbnxn_gpu_pick_ewald_kernel_type with deviceInfo.
//   - anything else throws gmx::InconsistentInputError.
// NOTE(review): embedded lines 341-343 and 355-358 are not visible in this listing, so the
// reaction-field return statement and the end of the throw expression cannot be checked
// here - confirm against the upstream file.
333 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
334 const DeviceInformation& deviceInfo)
336 if (ic->eeltype == eelCUT)
338 return ElecType::Cut;
340 else if (EEL_RF(ic->eeltype))
344 else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
// The Ewald flavor also depends on the cut-offs and the device, hence the helper call.
346 return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
350 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
351 GMX_THROW(gmx::InconsistentInputError(
352 gmx::formatString("The requested electrostatics type %s (%d) is not implemented in "
353 "the GPU accelerated kernels!",
354 EELTYPE(ic->eeltype),
360 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
362 if (ic->vdwtype == evdwCUT)
364 switch (ic->vdw_modifier)
367 case eintmodPOTSHIFT:
368 switch (ljCombinationRule)
370 case LJCombinationRule::None: return VdwType::Cut;
371 case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
372 case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
374 GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
375 "The requested LJ combination rule %s is not implemented in "
376 "the GPU accelerated kernels!",
377 enumValueToString(ljCombinationRule))));
379 case eintmodFORCESWITCH: return VdwType::FSwitch;
380 case eintmodPOTSWITCH: return VdwType::PSwitch;
382 GMX_THROW(gmx::InconsistentInputError(
383 gmx::formatString("The requested VdW interaction modifier %s (%d) is not "
384 "implemented in the GPU accelerated kernels!",
385 INTMODIFIER(ic->vdw_modifier),
389 else if (ic->vdwtype == evdwPME)
391 if (ic->ljpme_comb_rule == eljpmeGEOM)
393 assert(ljCombinationRule == LJCombinationRule::Geometric);
394 return VdwType::EwaldGeom;
398 assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
399 return VdwType::EwaldLB;
404 GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
405 "The requested VdW type %s (%d) is not implemented in the GPU accelerated kernels!",
406 EVDWTYPE(ic->vdwtype),