2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * Stubs of functions that must be defined by nbnxm sycl implementation.
40 * \ingroup module_nbnxm
44 #include "gromacs/gpu_utils/device_stream_manager.h"
45 #include "gromacs/hardware/device_information.h"
46 #include "gromacs/mdtypes/interaction_const.h"
47 #include "gromacs/nbnxm/atomdata.h"
48 #include "gromacs/nbnxm/gpu_data_mgmt.h"
49 #include "gromacs/nbnxm/nbnxm_gpu.h"
50 #include "gromacs/nbnxm/nbnxm_gpu_data_mgmt.h"
51 #include "gromacs/pbcutil/ishift.h"
52 #include "gromacs/utility/exceptions.h"
54 #include "nbnxm_sycl_types.h"
59 //! This function is documented in the header file
// Implementation note: every clear below is queued with clearDeviceBufferAsync
// on the local non-bonded stream. The force buffer is cleared over
// nb->atdat->numAtoms elements, the shift-force buffer over SHIFTS elements and
// each of the two energy buffers over a single element.
60 void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
62 NBAtomData* adat = nb->atdat;
63 const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
65 clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
66 // Clear shift force array and energies if the outputs were used in the current step
// NOTE(review): presumably the three clears below are guarded by computeVirial - TODO confirm
69 clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
70 clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
71 clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
75 /*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
// Records numTypes, allocates the fixed-size device buffers (shiftVec and fShift
// with SHIFTS elements, eLJ and eElec with one element each) in deviceContext,
// and queues clears of fShift, eElec and eLJ on the local non-bonded stream.
// The per-atom buffers xq and f are not allocated here.
76 static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& deviceContext)
78 const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
79 NBAtomData* atomdata = nb->atdat;
80 atomdata->numTypes = numTypes;
81 allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
// No shift vectors have been copied yet; gpu_upload_shiftvec() tests this flag
82 atomdata->shiftVecUploaded = false;
84 allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
85 allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
86 allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
// Start from zeroed shift-force and energy outputs
88 clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
89 clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
90 clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
92 /* initialize to nullptr the pointers to data that is not allocated here and
93 will need reallocation later */
94 atomdata->xq = nullptr;
95 atomdata->f = nullptr;
97 /* size -1 indicates that the respective array hasn't been initialized yet */
98 atomdata->numAtoms = -1;
99 atomdata->numAtomsAlloc = -1;
102 /*! \brief Initialize the nonbonded parameter data structure. */
// Steps, in order: set the cut-off parameters from ic and listParams, pick the
// VdW and electrostatics kernel flavours, then build up to three device lookup
// tables: the Ewald Coulomb table (tabulated Ewald only), the LJ parameter table
// (only when no LJ combination rule is used) and the LJ-PME parameter table
// (only when ic.vdwtype is VanDerWaalsType::Pme).
103 static void initNbparam(NBParamGpu* nbp,
104 const interaction_const_t& ic,
105 const PairlistParams& listParams,
106 const nbnxn_atomdata_t::Params& nbatParams,
107 const DeviceContext& deviceContext)
109 const int numTypes = nbatParams.numTypes;
111 set_cutoff_parameters(nbp, &ic, listParams);
113 nbp->vdwType = nbnxmGpuPickVdwKernelType(&ic, nbatParams.ljCombinationRule);
114 nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(&ic, deviceContext.deviceInfo());
116 /* generate table for PME */
// The table handle is reset first, so it stays null unless a tabulated Ewald kernel was picked
117 nbp->coulomb_tab = nullptr;
118 if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
120 GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
121 init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
124 /* set up LJ parameter lookup table */
125 if (!useLjCombRule(nbp->vdwType))
// The host array is reinterpreted as Float2 below, so the size of the device
// table handle type must be exactly twice the size of one host element
127 static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
128 "Mismatch in the size of host / device data types");
129 initParamLookupTable(&nbp->nbfp,
131 reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
136 /* set up LJ-PME parameter lookup table */
137 if (ic.vdwtype == VanDerWaalsType::Pme)
// Same size check as for nbfp: the host data is reinterpreted as Float2 below
139 static_assert(sizeof(decltype(nbp->nbfp_comb))
140 == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
141 "Mismatch in the size of host / device data types");
142 initParamLookupTable(&nbp->nbfp_comb,
143 &nbp->nbfp_comb_texobj,
144 reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
// Creates and initializes the NbnxmGpu object. It allocates the atom-data,
// parameter and pair-list sub-objects with new, obtains the host-side output
// staging buffers with pmalloc, binds the non-bonded stream(s) provided by
// deviceStreamManager, and finally runs initNbparam() and initAtomdataFirst().
// A second (non-local) pair list and stream are set up only when
// bLocalAndNonlocal is true.
// NOTE(review): presumably the new NbnxmGpu is returned to the caller and later
// released through gpu_free() - TODO confirm
150 NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
151 const interaction_const_t* ic,
152 const PairlistParams& listParams,
153 const nbnxn_atomdata_t* nbat,
154 const bool bLocalAndNonlocal)
156 auto* nb = new NbnxmGpu();
// Only the address of the manager's context is stored here
157 nb->deviceContext_ = &deviceStreamManager.context();
158 nb->atdat = new NBAtomData;
159 nb->nbparam = new NBParamGpu;
160 nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
161 if (bLocalAndNonlocal)
163 nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
166 nb->bUseTwoStreams = bLocalAndNonlocal;
// No timer or timing objects are created in this function
168 nb->timers = nullptr;
169 nb->timings = nullptr;
// Host-side staging storage: one element per energy term, SHIFTS elements for the shift forces
172 pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
173 pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
174 pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
176 init_plist(nb->plist[InteractionLocality::Local]);
178 /* local/non-local GPU streams */
179 GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
180 "Local non-bonded stream should be initialized to use GPU for non-bonded.");
181 nb->deviceStreams[InteractionLocality::Local] =
182 &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
183 // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
184 // out-of-order. But for the time being, it will be less disruptive to keep them.
185 if (nb->bUseTwoStreams)
187 init_plist(nb->plist[InteractionLocality::NonLocal]);
189 GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
190 "Non-local non-bonded stream should be initialized to use GPU for "
191 "non-bonded with domain decomposition.");
192 nb->deviceStreams[InteractionLocality::NonLocal] =
193 &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
198 const nbnxn_atomdata_t::Params& nbatParams = nbat->params();
199 const DeviceContext& deviceContext = *nb->deviceContext_;
// initAtomdataFirst() reads nb->deviceStreams[Local], so the streams must be bound before this point
201 initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
202 initAtomdataFirst(nb, nbatParams.numTypes, deviceContext);
// Copies the host shift vectors in nbatom->shift_vec to the device buffer
// adat->shiftVec with an asynchronous copy, then records that an upload has
// taken place. The copy is skipped when the box is static and the vectors were
// already uploaded.
207 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
209 NBAtomData* adat = nb->atdat;
210 const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
212 /* only if we have a dynamic box or the shift vectors have not been uploaded yet */
213 if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
// The host data is reinterpreted as Float3 below, so the element sizes must agree
215 GMX_ASSERT(adat->shiftVec.elementSize() == sizeof(nbatom->shift_vec[0]),
216 "Sizes of host- and device-side shift vectors should be the same.");
217 copyToDeviceBuffer(&adat->shiftVec,
218 reinterpret_cast<const Float3*>(nbatom->shift_vec.data()),
222 GpuApiCallBehavior::Async,
224 adat->shiftVecUploaded = true;
// Brings the device-side per-atom data in line with nbat. The per-atom buffers
// are (re)allocated when nbat holds more atoms than the current allocation, the
// atom counts are updated, and either the per-atom LJ combination parameters or
// the atom types are copied to the device asynchronously, depending on
// useLjCombRule(nb->nbparam->vdwType).
228 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
230 GMX_ASSERT(!nb->bDoTime, "Timing on SYCL not supported yet");
231 NBAtomData* atdat = nb->atdat;
232 const DeviceContext& deviceContext = *nb->deviceContext_;
233 const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
235 int numAtoms = nbat->numAtoms();
236 bool reallocated = false;
// Grow only; numAtomsAlloc starts at -1 (see initAtomdataFirst), so the first call always allocates
237 if (numAtoms > atdat->numAtomsAlloc)
// Allocate more than requested (over_alloc_small)
239 int numAlloc = over_alloc_small(numAtoms);
241 /* free up first if the arrays have already been initialized */
242 if (atdat->numAtomsAlloc != -1)
244 freeDeviceBuffer(&atdat->f);
245 freeDeviceBuffer(&atdat->xq);
246 freeDeviceBuffer(&atdat->atomTypes);
247 freeDeviceBuffer(&atdat->ljComb);
250 allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
251 allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
252 if (useLjCombRule(nb->nbparam->vdwType))
254 allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
// NOTE(review): presumably the alternative (else) branch of the check above - TODO confirm
258 allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
261 atdat->numAtomsAlloc = numAlloc;
265 atdat->numAtoms = numAtoms;
266 atdat->numAtomsLocal = nbat->natoms_local;
268 /* need to clear GPU f output if realloc happened */
// NOTE(review): presumably guarded by the reallocated flag - TODO confirm.
// The clear covers the whole allocation (numAtomsAlloc), not just numAtoms.
271 clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
274 if (useLjCombRule(nb->nbparam->vdwType))
// The host data is reinterpreted as Float2 below, so the element sizes must agree
276 GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(Float2),
277 "Size of the LJ parameters element should be equal to the size of float2.");
278 copyToDeviceBuffer(&atdat->ljComb,
279 reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
283 GpuApiCallBehavior::Async,
// NOTE(review): presumably the alternative (else) branch: upload atom types instead - TODO confirm
288 GMX_ASSERT(atdat->atomTypes.elementSize() == sizeof(nbat->params().type[0]),
289 "Sizes of host- and device-side atom types should be the same.");
290 copyToDeviceBuffer(&atdat->atomTypes,
291 nbat->params().type.data(),
295 GpuApiCallBehavior::Async,
// Releases what gpu_init() set up: the device lookup tables owned by
// nb->nbparam and the host-side staging buffers in nb->nbst, whose pointers are
// reset to nullptr afterwards.
300 void gpu_free(NbnxmGpu* nb)
307 NBAtomData* atdat = nb->atdat;
308 NBParamGpu* nbparam = nb->nbparam;
// NOTE(review): this condition is true only when coulomb_tab tests false, i.e.
// the destroy call is reached only when the table handle looks empty. That
// appears inverted with respect to the other two tables below, which are
// destroyed based on the kernel type alone - confirm the intent of the '!'.
310 if ((!nbparam->coulomb_tab)
311 && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
313 destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
// Same condition under which initNbparam() creates the LJ parameter table
316 if (!useLjCombRule(nb->nbparam->vdwType))
318 destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
// NOTE(review): initNbparam() creates nbfp_comb when ic.vdwtype is Pme; presumably
// EwaldGeom/EwaldLB are exactly the kernel types picked in that case - TODO confirm
321 if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
323 destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
327 auto* plist = nb->plist[InteractionLocality::Local];
// The non-local pair list exists only when two streams are in use (see gpu_init)
329 if (nb->bUseTwoStreams)
331 auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
// Reset the host staging pointers so that stale addresses cannot be reused
337 nb->nbst.eLJ = nullptr;
339 pfree(nb->nbst.eElec);
340 nb->nbst.eElec = nullptr;
342 pfree(nb->nbst.fShift);
343 nb->nbst.fShift = nullptr;
// Returns balancedFactor multiplied by the number of compute units that the
// SYCL device behind nb->deviceContext_ reports (info::device::max_compute_units).
350 int gpu_min_ci_balanced(NbnxmGpu* nb)
352 // SYCL-TODO: Logic and magic values taken from OpenCL
353 static constexpr unsigned int balancedFactor = 50;
358 const cl::sycl::device device = nb->deviceContext_->deviceInfo().syclDevice;
359 const int numComputeUnits = device.get_info<cl::sycl::info::device::max_compute_units>();
// unsigned int * int is evaluated as unsigned int and converted to int on return
360 return balancedFactor * numComputeUnits;