Move initiation of local CPU force H2D transfer to producer
[alexxy/gromacs.git] / src / gromacs / nbnxm / nbnxm_gpu_data_mgmt.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7  * and including many others, as listed in the AUTHORS file in the
8  * top-level source directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36 /*! \internal \file
37  *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
38  *
39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
42  *  \author Szilárd Páll <pall.szilard@gmail.com>
43  *  \author Artem Zhmurov <zhmurov@gmail.com>
44  *
45  *  \ingroup module_nbnxm
46  */
47 #include "gmxpre.h"
48
49 #include "config.h"
50
51 #if GMX_GPU_CUDA
52 #    include "cuda/nbnxm_cuda_types.h"
53 #endif
54
55 #if GMX_GPU_OPENCL
56 #    include "opencl/nbnxm_ocl_types.h"
57 #endif
58
59 #include "nbnxm_gpu_data_mgmt.h"
60
61 #include "gromacs/mdtypes/interaction_const.h"
62 #include "gromacs/nbnxm/gpu_data_mgmt.h"
63 #include "gromacs/timing/gpu_timing.h"
64 #include "gromacs/utility/cstringutil.h"
65
66 #include "nbnxm_gpu.h"
67 #include "pairlistsets.h"
68
69 namespace Nbnxm
70 {
71
72 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
73                                     NBParamGpu*                  nbp,
74                                     const DeviceContext&         deviceContext)
75 {
76     if (nbp->coulomb_tab)
77     {
78         destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
79     }
80
81     nbp->coulomb_tab_scale = tables.scale;
82     initParamLookupTable(
83             &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
84 }
85
86 void inline printEnvironmentVariableDeprecationMessage(bool               isEnvironmentVariableSet,
87                                                        const std::string& environmentVariableSuffix)
88 {
89     if (isEnvironmentVariableSet)
90     {
91         fprintf(stderr,
92                 "Environment variables GMX_CUDA_%s and GMX_OCL_%s are deprecated and will be\n"
93                 "removed in release 2022, please use GMX_GPU_%s instead.",
94                 environmentVariableSuffix.c_str(),
95                 environmentVariableSuffix.c_str(),
96                 environmentVariableSuffix.c_str());
97     }
98 }
99
100 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic)
101 {
102     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
103
104     /* Benchmarking/development environment variables to force the use of
105        analytical or tabulated Ewald kernel. */
106
107     // Remove these when old environment variables are deprecated
108     const bool forceAnalyticalEwaldLegacy = (getenv("GMX_CUDA_NB_ANA_EWALD") != nullptr)
109                                             || (getenv("GMX_OCL_NB_ANA_EWALD") != nullptr);
110     const bool forceTabulatedEwaldLegacy = (getenv("GMX_CUDA_NB_TAB_EWALD") != nullptr)
111                                            || (getenv("GMX_OCL_NB_TAB_EWALD") != nullptr);
112     const bool forceTwinCutoffEwaldLegacy = (getenv("GMX_CUDA_NB_EWALD_TWINCUT") != nullptr)
113                                             || (getenv("GMX_OCL_NB_EWALD_TWINCUT") != nullptr);
114
115     printEnvironmentVariableDeprecationMessage(forceAnalyticalEwaldLegacy, "NB_ANA_EWALD");
116     printEnvironmentVariableDeprecationMessage(forceTabulatedEwaldLegacy, "NB_TAB_EWALD");
117     printEnvironmentVariableDeprecationMessage(forceTwinCutoffEwaldLegacy, "NB_EWALD_TWINCUT");
118
119     const bool forceAnalyticalEwald =
120             (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr) || forceAnalyticalEwaldLegacy;
121     const bool forceTabulatedEwald =
122             (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr) || forceTabulatedEwaldLegacy;
123     const bool forceTwinCutoffEwald =
124             (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr) || forceTwinCutoffEwaldLegacy;
125
126     if (forceAnalyticalEwald && forceTabulatedEwald)
127     {
128         gmx_incons(
129                 "Both analytical and tabulated Ewald GPU non-bonded kernels "
130                 "requested through environment variables.");
131     }
132
133     /* By default, use analytical Ewald
134      * TODO: tabulated does not work in OpenCL, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
135      *
136      */
137     bool bUseAnalyticalEwald = true;
138     if (forceAnalyticalEwald)
139     {
140         if (debug)
141         {
142             fprintf(debug, "Using analytical Ewald GPU kernels\n");
143         }
144     }
145     else if (forceTabulatedEwald)
146     {
147         bUseAnalyticalEwald = false;
148
149         if (debug)
150         {
151             fprintf(debug, "Using tabulated Ewald GPU kernels\n");
152         }
153     }
154
155     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
156        forces it (use it for debugging/benchmarking only). */
157     if (!bTwinCut && !forceTwinCutoffEwald)
158     {
159         return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
160     }
161     else
162     {
163         return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
164     }
165 }
166
167 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
168 {
169     nbp->ewald_beta        = ic->ewaldcoeff_q;
170     nbp->sh_ewald          = ic->sh_ewald;
171     nbp->epsfac            = ic->epsfac;
172     nbp->two_k_rf          = 2.0 * ic->k_rf;
173     nbp->c_rf              = ic->c_rf;
174     nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
175     nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
176     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
177     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
178     nbp->useDynamicPruning = listParams.useDynamicPruning;
179
180     nbp->sh_lj_ewald   = ic->sh_lj_ewald;
181     nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
182
183     nbp->rvdw_switch      = ic->rvdw_switch;
184     nbp->dispersion_shift = ic->dispersion_shift;
185     nbp->repulsion_shift  = ic->repulsion_shift;
186     nbp->vdw_switch       = ic->vdw_switch;
187 }
188
189 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
190 {
191     if (!nbv || !nbv->useGpu())
192     {
193         return;
194     }
195     NbnxmGpu*   nb  = nbv->gpu_nbv;
196     NBParamGpu* nbp = nb->nbparam;
197
198     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
199
200     nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic);
201
202     GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
203     init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
204 }
205
206 void init_plist(gpu_plist* pl)
207 {
208     /* initialize to nullptr pointers to data that is not allocated here and will
209        need reallocation in nbnxn_gpu_init_pairlist */
210     pl->sci   = nullptr;
211     pl->cj4   = nullptr;
212     pl->imask = nullptr;
213     pl->excl  = nullptr;
214
215     /* size -1 indicates that the respective array hasn't been initialized yet */
216     pl->na_c          = -1;
217     pl->nsci          = -1;
218     pl->sci_nalloc    = -1;
219     pl->ncj4          = -1;
220     pl->cj4_nalloc    = -1;
221     pl->nimask        = -1;
222     pl->imask_nalloc  = -1;
223     pl->nexcl         = -1;
224     pl->excl_nalloc   = -1;
225     pl->haveFreshList = false;
226 }
227
228 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
229 {
230     int i, j;
231
232     t->nb_h2d_t = 0.0;
233     t->nb_d2h_t = 0.0;
234     t->nb_c     = 0;
235     t->pl_h2d_t = 0.0;
236     t->pl_h2d_c = 0;
237     for (i = 0; i < 2; i++)
238     {
239         for (j = 0; j < 2; j++)
240         {
241             t->ktime[i][j].t = 0.0;
242             t->ktime[i][j].c = 0;
243         }
244     }
245     t->pruneTime.c        = 0;
246     t->pruneTime.t        = 0.0;
247     t->dynamicPruneTime.c = 0;
248     t->dynamicPruneTime.t = 0.0;
249 }
250
251 //! This function is documented in the header file
252 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
253 {
254     char sbuf[STRLEN];
255     // Timing accumulation should happen only if there was work to do
256     // because getLastRangeTime() gets skipped with empty lists later
257     // which leads to the counter not being reset.
258     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
259     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
260     gpu_plist*          d_plist      = nb->plist[iloc];
261
262     if (d_plist->na_c < 0)
263     {
264         d_plist->na_c = h_plist->na_ci;
265     }
266     else
267     {
268         if (d_plist->na_c != h_plist->na_ci)
269         {
270             sprintf(sbuf,
271                     "In init_plist: the #atoms per cell has changed (from %d to %d)",
272                     d_plist->na_c,
273                     h_plist->na_ci);
274             gmx_incons(sbuf);
275         }
276     }
277
278     gpu_timers_t::Interaction& iTimers = nb->timers->interaction[iloc];
279
280     if (bDoTime)
281     {
282         iTimers.pl_h2d.openTimingRegion(deviceStream);
283         iTimers.didPairlistH2D = true;
284     }
285
286     // TODO most of this function is same in CUDA and OpenCL, move into the header
287     const DeviceContext& deviceContext = *nb->deviceContext_;
288
289     reallocateDeviceBuffer(
290             &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
291     copyToDeviceBuffer(&d_plist->sci,
292                        h_plist->sci.data(),
293                        0,
294                        h_plist->sci.size(),
295                        deviceStream,
296                        GpuApiCallBehavior::Async,
297                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
298
299     reallocateDeviceBuffer(
300             &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
301     copyToDeviceBuffer(&d_plist->cj4,
302                        h_plist->cj4.data(),
303                        0,
304                        h_plist->cj4.size(),
305                        deviceStream,
306                        GpuApiCallBehavior::Async,
307                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
308
309     reallocateDeviceBuffer(&d_plist->imask,
310                            h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
311                            &d_plist->nimask,
312                            &d_plist->imask_nalloc,
313                            deviceContext);
314
315     reallocateDeviceBuffer(
316             &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
317     copyToDeviceBuffer(&d_plist->excl,
318                        h_plist->excl.data(),
319                        0,
320                        h_plist->excl.size(),
321                        deviceStream,
322                        GpuApiCallBehavior::Async,
323                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
324
325     if (bDoTime)
326     {
327         iTimers.pl_h2d.closeTimingRegion(deviceStream);
328     }
329
330     /* need to prune the pair list during the next step */
331     d_plist->haveFreshList = true;
332 }
333
334 //! This function is documented in the header file
335 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
336 {
337     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
338 }
339
340 //! This function is documented in the header file
341 void gpu_reset_timings(nonbonded_verlet_t* nbv)
342 {
343     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
344     {
345         init_timings(nbv->gpu_nbv->timings);
346     }
347 }
348
349 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
350 {
351     return ((nb->nbparam->elecType == ElecType::EwaldAna)
352             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
353 }
354
355 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic)
356 {
357     if (ic->eeltype == eelCUT)
358     {
359         return ElecType::Cut;
360     }
361     else if (EEL_RF(ic->eeltype))
362     {
363         return ElecType::RF;
364     }
365     else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
366     {
367         return nbnxn_gpu_pick_ewald_kernel_type(*ic);
368     }
369     else
370     {
371         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
372         GMX_THROW(gmx::InconsistentInputError(
373                 gmx::formatString("The requested electrostatics type %s (%d) is not implemented in "
374                                   "the GPU accelerated kernels!",
375                                   EELTYPE(ic->eeltype),
376                                   ic->eeltype)));
377     }
378 }
379
380
381 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, int combRule)
382 {
383     if (ic->vdwtype == evdwCUT)
384     {
385         switch (ic->vdw_modifier)
386         {
387             case eintmodNONE:
388             case eintmodPOTSHIFT:
389                 switch (combRule)
390                 {
391                     case ljcrNONE: return VdwType::Cut;
392                     case ljcrGEOM: return VdwType::CutCombGeom;
393                     case ljcrLB: return VdwType::CutCombLB;
394                     default:
395                         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
396                                 "The requested LJ combination rule %s (%d) is not implemented in "
397                                 "the GPU accelerated kernels!",
398                                 enum_name(combRule, ljcrNR, c_ljcrNames),
399                                 combRule)));
400                 }
401             case eintmodFORCESWITCH: return VdwType::FSwitch;
402             case eintmodPOTSWITCH: return VdwType::PSwitch;
403             default:
404                 GMX_THROW(gmx::InconsistentInputError(
405                         gmx::formatString("The requested VdW interaction modifier %s (%d) is not "
406                                           "implemented in the GPU accelerated kernels!",
407                                           INTMODIFIER(ic->vdw_modifier),
408                                           ic->vdw_modifier)));
409         }
410     }
411     else if (ic->vdwtype == evdwPME)
412     {
413         if (ic->ljpme_comb_rule == ljcrGEOM)
414         {
415             assert(combRule == ljcrGEOM);
416             return VdwType::EwaldGeom;
417         }
418         else
419         {
420             assert(combRule == ljcrLB);
421             return VdwType::EwaldLB;
422         }
423     }
424     else
425     {
426         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
427                 "The requested VdW type %s (%d) is not implemented in the GPU accelerated kernels!",
428                 EVDWTYPE(ic->vdwtype),
429                 ic->vdwtype)));
430     }
431 }
432
433 } // namespace Nbnxm