SYCL NBNXM offload support
[alexxy/gromacs.git] / src / gromacs / nbnxm / sycl / nbnxm_sycl_data_mgmt.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2020,2021, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
/*! \internal \file
 *  \brief
 *  Data management functions (initialization, upload, cleanup) for the
 *  SYCL implementation of nbnxm.
 *
 *  \ingroup module_nbnxm
 */
42 #include "gmxpre.h"
43
44 #include "gromacs/gpu_utils/device_stream_manager.h"
45 #include "gromacs/hardware/device_information.h"
46 #include "gromacs/mdtypes/interaction_const.h"
47 #include "gromacs/nbnxm/atomdata.h"
48 #include "gromacs/nbnxm/gpu_data_mgmt.h"
49 #include "gromacs/nbnxm/nbnxm_gpu.h"
50 #include "gromacs/nbnxm/nbnxm_gpu_data_mgmt.h"
51 #include "gromacs/pbcutil/ishift.h"
52 #include "gromacs/utility/exceptions.h"
53
54 #include "nbnxm_sycl_types.h"
55
56 namespace Nbnxm
57 {
58
59 //! This function is documented in the header file
60 void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
61 {
62     sycl_atomdata_t*    adat        = nb->atdat;
63     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
64     // Clear forces
65     clearDeviceBufferAsync(&adat->f, 0, nb->atdat->natoms, localStream);
66     // Clear shift force array and energies if the outputs were used in the current step
67     if (computeVirial)
68     {
69         clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
70         clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
71         clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
72     }
73 }
74
75 /*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
76 static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& deviceContext)
77 {
78     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
79     sycl_atomdata_t*    atomdata    = nb->atdat;
80     atomdata->numTypes              = numTypes;
81     allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
82     atomdata->shiftVecUploaded = false;
83
84     allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
85     allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
86     allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
87
88     clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
89     clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
90     clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
91
92     /* initialize to nullptr pointers to data that is not allocated here and will
93        need reallocation in later */
94     atomdata->xq = nullptr;
95     atomdata->f  = nullptr;
96
97     /* size -1 indicates that the respective array hasn't been initialized yet */
98     atomdata->natoms   = -1;
99     atomdata->numAlloc = -1;
100 }
101
102 /*! \brief Initialize the nonbonded parameter data structure. */
103 static void initNbparam(NBParamGpu*                     nbp,
104                         const interaction_const_t&      ic,
105                         const PairlistParams&           listParams,
106                         const nbnxn_atomdata_t::Params& nbatParams,
107                         const DeviceContext&            deviceContext)
108 {
109     const int numTypes = nbatParams.numTypes;
110
111     set_cutoff_parameters(nbp, &ic, listParams);
112
113     nbp->vdwType  = nbnxmGpuPickVdwKernelType(&ic, nbatParams.ljCombinationRule);
114     nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(&ic, deviceContext.deviceInfo());
115
116     /* generate table for PME */
117     nbp->coulomb_tab = nullptr;
118     if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
119     {
120         GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
121         init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
122     }
123
124     /* set up LJ parameter lookup table */
125     if (!useLjCombRule(nbp->vdwType))
126     {
127         initParamLookupTable(
128                 &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * numTypes * numTypes, deviceContext);
129     }
130
131     /* set up LJ-PME parameter lookup table */
132     if (ic.vdwtype == evdwPME)
133     {
134         initParamLookupTable(
135                 &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * numTypes, deviceContext);
136     }
137 }
138
139 NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
140                    const interaction_const_t*      ic,
141                    const PairlistParams&           listParams,
142                    const nbnxn_atomdata_t*         nbat,
143                    const bool                      bLocalAndNonlocal)
144 {
145     auto* nb                              = new NbnxmGpu();
146     nb->deviceContext_                    = &deviceStreamManager.context();
147     nb->atdat                             = new sycl_atomdata_t;
148     nb->nbparam                           = new NBParamGpu;
149     nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
150     if (bLocalAndNonlocal)
151     {
152         nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
153     }
154
155     nb->bUseTwoStreams = bLocalAndNonlocal;
156
157     nb->timers  = nullptr;
158     nb->timings = nullptr;
159
160     /* init nbst */
161     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
162     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
163     pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
164
165     init_plist(nb->plist[InteractionLocality::Local]);
166
167     /* local/non-local GPU streams */
168     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
169                        "Local non-bonded stream should be initialized to use GPU for non-bonded.");
170     nb->deviceStreams[InteractionLocality::Local] =
171             &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
172     // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
173     // out-of-order. But for the time being, it will be less disruptive to keep them.
174     if (nb->bUseTwoStreams)
175     {
176         init_plist(nb->plist[InteractionLocality::NonLocal]);
177
178         GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
179                            "Non-local non-bonded stream should be initialized to use GPU for "
180                            "non-bonded with domain decomposition.");
181         nb->deviceStreams[InteractionLocality::NonLocal] =
182                 &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
183     }
184
185     nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer();
186
187     nb->bDoTime = false;
188
189     const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
190     const DeviceContext&            deviceContext = *nb->deviceContext_;
191
192     initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
193     initAtomdataFirst(nb, nbatParams.numTypes, deviceContext);
194
195     return nb;
196 }
197
198 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
199 {
200     sycl_atomdata_t*    adat        = nb->atdat;
201     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
202
203     /* only if we have a dynamic box */
204     if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
205     {
206         GMX_ASSERT(adat->shiftVec.elementSize() == sizeof(nbatom->shift_vec[0]),
207                    "Sizes of host- and device-side shift vectors should be the same.");
208         copyToDeviceBuffer(&adat->shiftVec,
209                            reinterpret_cast<const float3*>(nbatom->shift_vec.data()),
210                            0,
211                            SHIFTS,
212                            localStream,
213                            GpuApiCallBehavior::Async,
214                            nullptr);
215         adat->shiftVecUploaded = true;
216     }
217 }
218
219 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
220 {
221     GMX_ASSERT(!nb->bDoTime, "Timing on SYCL not supported yet");
222     sycl_atomdata_t*     atdat         = nb->atdat;
223     const DeviceContext& deviceContext = *nb->deviceContext_;
224     const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
225
226     int  numAtoms    = nbat->numAtoms();
227     bool reallocated = false;
228     if (numAtoms > atdat->numAlloc)
229     {
230         int numAlloc = over_alloc_small(numAtoms);
231
232         /* free up first if the arrays have already been initialized */
233         if (atdat->numAlloc != -1)
234         {
235             freeDeviceBuffer(&atdat->f);
236             freeDeviceBuffer(&atdat->xq);
237             freeDeviceBuffer(&atdat->atomTypes);
238             freeDeviceBuffer(&atdat->ljComb);
239         }
240
241         allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
242         allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
243         if (useLjCombRule(nb->nbparam->vdwType))
244         {
245             allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
246         }
247         else
248         {
249             allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
250         }
251
252         atdat->numAlloc = numAlloc;
253         reallocated     = true;
254     }
255
256     atdat->natoms       = numAtoms;
257     atdat->natoms_local = nbat->natoms_local;
258
259     /* need to clear GPU f output if realloc happened */
260     if (reallocated)
261     {
262         clearDeviceBufferAsync(&atdat->f, 0, atdat->numAlloc, localStream);
263     }
264
265     if (useLjCombRule(nb->nbparam->vdwType))
266     {
267         GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(float2),
268                    "Size of the LJ parameters element should be equal to the size of float2.");
269         copyToDeviceBuffer(&atdat->ljComb,
270                            reinterpret_cast<const float2*>(nbat->params().lj_comb.data()),
271                            0,
272                            numAtoms,
273                            localStream,
274                            GpuApiCallBehavior::Async,
275                            nullptr);
276     }
277     else
278     {
279         GMX_ASSERT(atdat->atomTypes.elementSize() == sizeof(nbat->params().type[0]),
280                    "Sizes of host- and device-side atom types should be the same.");
281         copyToDeviceBuffer(&atdat->atomTypes,
282                            nbat->params().type.data(),
283                            0,
284                            numAtoms,
285                            localStream,
286                            GpuApiCallBehavior::Async,
287                            nullptr);
288     }
289 }
290
291 void gpu_free(NbnxmGpu* nb)
292 {
293     if (nb == nullptr)
294     {
295         return;
296     }
297
298     sycl_atomdata_t* atdat   = nb->atdat;
299     NBParamGpu*      nbparam = nb->nbparam;
300
301     if ((!nbparam->coulomb_tab)
302         && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
303     {
304         destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
305     }
306
307     if (!useLjCombRule(nb->nbparam->vdwType))
308     {
309         destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
310     }
311
312     if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
313     {
314         destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
315     }
316
317     /* Free plist */
318     auto* plist = nb->plist[InteractionLocality::Local];
319     delete plist;
320     if (nb->bUseTwoStreams)
321     {
322         auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
323         delete plist_nl;
324     }
325
326     /* Free nbst */
327     pfree(nb->nbst.e_lj);
328     nb->nbst.e_lj = nullptr;
329
330     pfree(nb->nbst.e_el);
331     nb->nbst.e_el = nullptr;
332
333     pfree(nb->nbst.fshift);
334     nb->nbst.fshift = nullptr;
335
336     delete atdat;
337     delete nbparam;
338     delete nb;
339 }
340
341 int gpu_min_ci_balanced(NbnxmGpu* nb)
342 {
343     // SYCL-TODO: Logic and magic values taken from OpenCL
344     static constexpr unsigned int balancedFactor = 50;
345     if (nb == nullptr)
346     {
347         return 0;
348     }
349     const cl::sycl::device device = nb->deviceContext_->deviceInfo().syclDevice;
350     const int numComputeUnits     = device.get_info<cl::sycl::info::device::max_compute_units>();
351     return balancedFactor * numComputeUnits;
352 }
353
354 } // namespace Nbnxm