Merge branch release-2021 into master
[alexxy/gromacs.git] / src / gromacs / nbnxm / nbnxm_gpu_data_mgmt.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7  * and including many others, as listed in the AUTHORS file in the
8  * top-level source directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36 /*! \internal \file
37  *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
38  *
39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
42  *  \author Szilárd Páll <pall.szilard@gmail.com>
43  *  \author Artem Zhmurov <zhmurov@gmail.com>
44  *
45  *  \ingroup module_nbnxm
46  */
47 #include "gmxpre.h"
48
49 #include "config.h"
50
51 #if GMX_GPU_CUDA
52 #    include "cuda/nbnxm_cuda_types.h"
53 #endif
54
55 #if GMX_GPU_OPENCL
56 #    include "opencl/nbnxm_ocl_types.h"
57 #endif
58
59 #include "nbnxm_gpu_data_mgmt.h"
60
61 #include "gromacs/hardware/device_information.h"
62 #include "gromacs/mdtypes/interaction_const.h"
63 #include "gromacs/nbnxm/gpu_data_mgmt.h"
64 #include "gromacs/timing/gpu_timing.h"
65 #include "gromacs/utility/cstringutil.h"
66
67 #include "nbnxm_gpu.h"
68 #include "pairlistsets.h"
69
70 namespace Nbnxm
71 {
72
73 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
74                                     NBParamGpu*                  nbp,
75                                     const DeviceContext&         deviceContext)
76 {
77     if (nbp->coulomb_tab)
78     {
79         destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
80     }
81
82     nbp->coulomb_tab_scale = tables.scale;
83     initParamLookupTable(
84             &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
85 }
86
87 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
88                                                const DeviceInformation gmx_unused& deviceInfo)
89 {
90     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
91
92     /* Benchmarking/development environment variables to force the use of
93        analytical or tabulated Ewald kernel. */
94     const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
95     const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
96     const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
97
98     if (forceAnalyticalEwald && forceTabulatedEwald)
99     {
100         gmx_incons(
101                 "Both analytical and tabulated Ewald GPU non-bonded kernels "
102                 "requested through environment variables.");
103     }
104
105     /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
106      */
107     const bool c_useTabulatedEwaldDefault =
108 #if GMX_GPU_CUDA
109             (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
110             || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
111 #else
112             false;
113 #endif
114     bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
115     if (forceAnalyticalEwald)
116     {
117         bUseAnalyticalEwald = true;
118         if (debug)
119         {
120             fprintf(debug, "Using analytical Ewald GPU kernels\n");
121         }
122     }
123     else if (forceTabulatedEwald)
124     {
125         bUseAnalyticalEwald = false;
126
127         if (debug)
128         {
129             fprintf(debug, "Using tabulated Ewald GPU kernels\n");
130         }
131     }
132
133     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
134        forces it (use it for debugging/benchmarking only). */
135     if (!bTwinCut && !forceTwinCutoffEwald)
136     {
137         return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
138     }
139     else
140     {
141         return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
142     }
143 }
144
145 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
146 {
147     nbp->ewald_beta        = ic->ewaldcoeff_q;
148     nbp->sh_ewald          = ic->sh_ewald;
149     nbp->epsfac            = ic->epsfac;
150     nbp->two_k_rf          = 2.0 * ic->k_rf;
151     nbp->c_rf              = ic->c_rf;
152     nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
153     nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
154     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
155     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
156     nbp->useDynamicPruning = listParams.useDynamicPruning;
157
158     nbp->sh_lj_ewald   = ic->sh_lj_ewald;
159     nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
160
161     nbp->rvdw_switch      = ic->rvdw_switch;
162     nbp->dispersion_shift = ic->dispersion_shift;
163     nbp->repulsion_shift  = ic->repulsion_shift;
164     nbp->vdw_switch       = ic->vdw_switch;
165 }
166
167 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
168 {
169     if (!nbv || !nbv->useGpu())
170     {
171         return;
172     }
173     NbnxmGpu*   nb  = nbv->gpu_nbv;
174     NBParamGpu* nbp = nb->nbparam;
175
176     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
177
178     nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());
179
180     GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
181     init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
182 }
183
184 void init_plist(gpu_plist* pl)
185 {
186     /* initialize to nullptr pointers to data that is not allocated here and will
187        need reallocation in nbnxn_gpu_init_pairlist */
188     pl->sci   = nullptr;
189     pl->cj4   = nullptr;
190     pl->imask = nullptr;
191     pl->excl  = nullptr;
192
193     /* size -1 indicates that the respective array hasn't been initialized yet */
194     pl->na_c                   = -1;
195     pl->nsci                   = -1;
196     pl->sci_nalloc             = -1;
197     pl->ncj4                   = -1;
198     pl->cj4_nalloc             = -1;
199     pl->nimask                 = -1;
200     pl->imask_nalloc           = -1;
201     pl->nexcl                  = -1;
202     pl->excl_nalloc            = -1;
203     pl->haveFreshList          = false;
204     pl->rollingPruningNumParts = 0;
205     pl->rollingPruningPart     = 0;
206 }
207
208 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
209 {
210     t->nb_h2d_t = 0.0;
211     t->nb_d2h_t = 0.0;
212     t->nb_c     = 0;
213     t->pl_h2d_t = 0.0;
214     t->pl_h2d_c = 0;
215     for (int i = 0; i < 2; i++)
216     {
217         for (int j = 0; j < 2; j++)
218         {
219             t->ktime[i][j].t = 0.0;
220             t->ktime[i][j].c = 0;
221         }
222     }
223     t->pruneTime.c        = 0;
224     t->pruneTime.t        = 0.0;
225     t->dynamicPruneTime.c = 0;
226     t->dynamicPruneTime.t = 0.0;
227 }
228
229 //! This function is documented in the header file
230 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
231 {
232     char sbuf[STRLEN];
233     // Timing accumulation should happen only if there was work to do
234     // because getLastRangeTime() gets skipped with empty lists later
235     // which leads to the counter not being reset.
236     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
237     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
238     gpu_plist*          d_plist      = nb->plist[iloc];
239
240     if (d_plist->na_c < 0)
241     {
242         d_plist->na_c = h_plist->na_ci;
243     }
244     else
245     {
246         if (d_plist->na_c != h_plist->na_ci)
247         {
248             sprintf(sbuf,
249                     "In init_plist: the #atoms per cell has changed (from %d to %d)",
250                     d_plist->na_c,
251                     h_plist->na_ci);
252             gmx_incons(sbuf);
253         }
254     }
255
256     gpu_timers_t::Interaction& iTimers = nb->timers->interaction[iloc];
257
258     if (bDoTime)
259     {
260         iTimers.pl_h2d.openTimingRegion(deviceStream);
261         iTimers.didPairlistH2D = true;
262     }
263
264     // TODO most of this function is same in CUDA and OpenCL, move into the header
265     const DeviceContext& deviceContext = *nb->deviceContext_;
266
267     reallocateDeviceBuffer(
268             &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
269     copyToDeviceBuffer(&d_plist->sci,
270                        h_plist->sci.data(),
271                        0,
272                        h_plist->sci.size(),
273                        deviceStream,
274                        GpuApiCallBehavior::Async,
275                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
276
277     reallocateDeviceBuffer(
278             &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
279     copyToDeviceBuffer(&d_plist->cj4,
280                        h_plist->cj4.data(),
281                        0,
282                        h_plist->cj4.size(),
283                        deviceStream,
284                        GpuApiCallBehavior::Async,
285                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
286
287     reallocateDeviceBuffer(&d_plist->imask,
288                            h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
289                            &d_plist->nimask,
290                            &d_plist->imask_nalloc,
291                            deviceContext);
292
293     reallocateDeviceBuffer(
294             &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
295     copyToDeviceBuffer(&d_plist->excl,
296                        h_plist->excl.data(),
297                        0,
298                        h_plist->excl.size(),
299                        deviceStream,
300                        GpuApiCallBehavior::Async,
301                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
302
303     if (bDoTime)
304     {
305         iTimers.pl_h2d.closeTimingRegion(deviceStream);
306     }
307
308     /* need to prune the pair list during the next step */
309     d_plist->haveFreshList = true;
310 }
311
312 //! This function is documented in the header file
313 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
314 {
315     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
316 }
317
318 //! This function is documented in the header file
319 void gpu_reset_timings(nonbonded_verlet_t* nbv)
320 {
321     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
322     {
323         init_timings(nbv->gpu_nbv->timings);
324     }
325 }
326
327 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
328 {
329     return ((nb->nbparam->elecType == ElecType::EwaldAna)
330             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
331 }
332
333 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
334                                                    const DeviceInformation&   deviceInfo)
335 {
336     if (ic->eeltype == eelCUT)
337     {
338         return ElecType::Cut;
339     }
340     else if (EEL_RF(ic->eeltype))
341     {
342         return ElecType::RF;
343     }
344     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
345     {
346         return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
347     }
348     else
349     {
350         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
351         GMX_THROW(gmx::InconsistentInputError(
352                 gmx::formatString("The requested electrostatics type %s (%d) is not implemented in "
353                                   "the GPU accelerated kernels!",
354                                   EELTYPE(ic->eeltype),
355                                   ic->eeltype)));
356     }
357 }
358
359
360 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
361 {
362     if (ic->vdwtype == evdwCUT)
363     {
364         switch (ic->vdw_modifier)
365         {
366             case eintmodNONE:
367             case eintmodPOTSHIFT:
368                 switch (ljCombinationRule)
369                 {
370                     case LJCombinationRule::None: return VdwType::Cut;
371                     case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
372                     case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
373                     default:
374                         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
375                                 "The requested LJ combination rule %s is not implemented in "
376                                 "the GPU accelerated kernels!",
377                                 enumValueToString(ljCombinationRule))));
378                 }
379             case eintmodFORCESWITCH: return VdwType::FSwitch;
380             case eintmodPOTSWITCH: return VdwType::PSwitch;
381             default:
382                 GMX_THROW(gmx::InconsistentInputError(
383                         gmx::formatString("The requested VdW interaction modifier %s (%d) is not "
384                                           "implemented in the GPU accelerated kernels!",
385                                           INTMODIFIER(ic->vdw_modifier),
386                                           ic->vdw_modifier)));
387         }
388     }
389     else if (ic->vdwtype == evdwPME)
390     {
391         if (ic->ljpme_comb_rule == eljpmeGEOM)
392         {
393             assert(ljCombinationRule == LJCombinationRule::Geometric);
394             return VdwType::EwaldGeom;
395         }
396         else
397         {
398             assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
399             return VdwType::EwaldLB;
400         }
401     }
402     else
403     {
404         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
405                 "The requested VdW type %s (%d) is not implemented in the GPU accelerated kernels!",
406                 EVDWTYPE(ic->vdwtype),
407                 ic->vdwtype)));
408     }
409 }
410
411 } // namespace Nbnxm