Unify coordinate copy handling across GPU platforms
[alexxy/gromacs.git] / src / gromacs / mdtypes / state_propagator_data_gpu_impl_gpu.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*! \internal \file
36  *
37  * \brief Definitions of interfaces for GPU state data propagator object.
38  *
39  * \author Artem Zhmurov <zhmurov@gmail.com>
40  *
41  * \ingroup module_mdtypes
42  */
43 #include "gmxpre.h"
44
45 #include "config.h"
46
47 #if GMX_GPU
48
49 #    include "gromacs/gpu_utils/device_stream_manager.h"
50 #    include "gromacs/gpu_utils/devicebuffer.h"
51 #    include "gromacs/gpu_utils/gpueventsynchronizer.h"
52 #    include "gromacs/math/vectypes.h"
53 #    include "gromacs/mdtypes/state_propagator_data_gpu.h"
54 #    include "gromacs/timing/wallcycle.h"
55 #    include "gromacs/utility/classhelpers.h"
56
57 #    include "state_propagator_data_gpu_impl.h"
58
59
60 namespace gmx
61 {
62
63 StatePropagatorDataGpu::Impl::Impl(const DeviceStreamManager& deviceStreamManager,
64                                    GpuApiCallBehavior         transferKind,
65                                    int                        allocationBlockSizeDivisor,
66                                    gmx_wallcycle*             wcycle) :
67     deviceContext_(deviceStreamManager.context()),
68     transferKind_(transferKind),
69     allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
70     wcycle_(wcycle)
71 {
72     static_assert(
73             GMX_GPU,
74             "GPU state propagator data object should only be constructed on the GPU code-paths.");
75
76     // We need to keep local copies for re-initialization.
77     pmeStream_      = &deviceStreamManager.stream(DeviceStreamType::Pme);
78     localStream_    = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal);
79     nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal);
80     updateStream_   = &deviceStreamManager.stream(DeviceStreamType::UpdateAndConstraints);
81
82     // Map the atom locality to the stream that will be used for coordinates,
83     // velocities and forces transfers. Same streams are used for H2D and D2H copies.
84     // Note, that nullptr stream is used here to indicate that the copy is not supported.
85     xCopyStreams_[AtomLocality::Local]    = updateStream_;
86     xCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
87     xCopyStreams_[AtomLocality::All]      = nullptr;
88
89     vCopyStreams_[AtomLocality::Local]    = updateStream_;
90     vCopyStreams_[AtomLocality::NonLocal] = nullptr;
91     vCopyStreams_[AtomLocality::All]      = nullptr;
92
93     fCopyStreams_[AtomLocality::Local]    = localStream_;
94     fCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
95     fCopyStreams_[AtomLocality::All]      = updateStream_;
96
97     copyInStream_ = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::Normal, false);
98     memsetStream_ = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::Normal, false);
99 }
100
101 StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
102                                    const DeviceContext& deviceContext,
103                                    GpuApiCallBehavior   transferKind,
104                                    int                  allocationBlockSizeDivisor,
105                                    gmx_wallcycle*       wcycle) :
106     deviceContext_(deviceContext),
107     transferKind_(transferKind),
108     allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
109     wcycle_(wcycle)
110 {
111     static_assert(
112             GMX_GPU,
113             "GPU state propagator data object should only be constructed on the GPU code-paths.");
114
115     GMX_ASSERT(pmeStream->isValid(), "GPU PME stream should be valid.");
116     pmeStream_      = pmeStream;
117     localStream_    = pmeStream; // For clearing the force buffer
118     nonLocalStream_ = nullptr;
119     updateStream_   = nullptr;
120
121     isPmeOnly_ = true;
122
123     // Only local/all coordinates are allowed to be copied in PME-only rank/ PME tests.
124     // This it temporary measure to make it safe to use this class in those cases.
125     xCopyStreams_[AtomLocality::Local]    = pmeStream_;
126     xCopyStreams_[AtomLocality::NonLocal] = nullptr;
127     xCopyStreams_[AtomLocality::All]      = nullptr;
128
129     vCopyStreams_[AtomLocality::Local]    = nullptr;
130     vCopyStreams_[AtomLocality::NonLocal] = nullptr;
131     vCopyStreams_[AtomLocality::All]      = nullptr;
132
133     fCopyStreams_[AtomLocality::Local]    = nullptr;
134     fCopyStreams_[AtomLocality::NonLocal] = nullptr;
135     fCopyStreams_[AtomLocality::All]      = nullptr;
136 }
137
138 StatePropagatorDataGpu::Impl::~Impl() {}
139
140 void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
141 {
142     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
143     wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
144
145     numAtomsLocal_ = numAtomsLocal;
146     numAtomsAll_   = numAtomsAll;
147
148     int numAtomsPadded;
149     if (allocationBlockSizeDivisor_ > 0)
150     {
151         numAtomsPadded = ((numAtomsAll_ + allocationBlockSizeDivisor_ - 1) / allocationBlockSizeDivisor_)
152                          * allocationBlockSizeDivisor_;
153     }
154     else
155     {
156         numAtomsPadded = numAtomsAll_;
157     }
158
159     reallocateDeviceBuffer(&d_x_, numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);
160
161     const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
162     if (paddingAllocationSize > 0)
163     {
164         // The PME stream is used here because the padding region of d_x_ is only in the PME task.
165         clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, *pmeStream_);
166     }
167
168     reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
169     const int d_fOldCapacity = d_fCapacity_;
170     reallocateDeviceBuffer(&d_f_, numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
171
172     // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
173     // the force accumulation stage before syncing with the local stream. Only done in CUDA and
174     // SYCL, since the force buffer ops are not implemented in OpenCL.
175     if ((bool(GMX_GPU_CUDA) || bool(GMX_GPU_SYCL)) && d_fCapacity_ != d_fOldCapacity)
176     {
177         clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_);
178     }
179
180     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
181     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
182 }
183
184 std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
185 {
186     int atomsStartAt   = 0;
187     int numAtomsToCopy = 0;
188     switch (atomLocality)
189     {
190         case AtomLocality::All:
191             atomsStartAt   = 0;
192             numAtomsToCopy = numAtomsAll_;
193             break;
194         case AtomLocality::Local:
195             atomsStartAt   = 0;
196             numAtomsToCopy = numAtomsLocal_;
197             break;
198         case AtomLocality::NonLocal:
199             atomsStartAt   = numAtomsLocal_;
200             numAtomsToCopy = numAtomsAll_ - numAtomsLocal_;
201             break;
202         default:
203             GMX_RELEASE_ASSERT(false,
204                                "Wrong range of atoms requested in GPU state data manager. Should "
205                                "be All, Local or NonLocal.");
206     }
207     GMX_ASSERT(atomsStartAt >= 0,
208                "The first elemtnt to copy has negative index. Probably, the GPU propagator state "
209                "was not initialized.");
210     GMX_ASSERT(numAtomsToCopy >= 0,
211                "Number of atoms to copy is negative. Probably, the GPU propagator state was not "
212                "initialized.");
213     return std::make_tuple(atomsStartAt, numAtomsToCopy);
214 }
215
216 void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>                   d_data,
217                                                 const gmx::ArrayRef<const gmx::RVec> h_data,
218                                                 int                                  dataSize,
219                                                 AtomLocality                         atomLocality,
220                                                 const DeviceStream&                  deviceStream)
221 {
222     GMX_UNUSED_VALUE(dataSize);
223
224     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
225
226     GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
227
228     GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
229
230     int atomsStartAt, numAtomsToCopy;
231     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
232
233     if (numAtomsToCopy != 0)
234     {
235         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
236                    "The device allocation is smaller than requested copy range.");
237         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
238                    "The host buffer is smaller than the requested copy range.");
239
240         copyToDeviceBuffer(&d_data,
241                            reinterpret_cast<const RVec*>(&h_data.data()[atomsStartAt]),
242                            atomsStartAt,
243                            numAtomsToCopy,
244                            deviceStream,
245                            transferKind_,
246                            nullptr);
247     }
248 }
249
250 void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
251                                                   DeviceBuffer<RVec>       d_data,
252                                                   int                      dataSize,
253                                                   AtomLocality             atomLocality,
254                                                   const DeviceStream&      deviceStream)
255 {
256     GMX_UNUSED_VALUE(dataSize);
257
258     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
259
260     GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
261
262     GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
263
264     int atomsStartAt, numAtomsToCopy;
265     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
266
267     if (numAtomsToCopy != 0)
268     {
269         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
270                    "The device allocation is smaller than requested copy range.");
271         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
272                    "The host buffer is smaller than the requested copy range.");
273
274         copyFromDeviceBuffer(reinterpret_cast<RVec*>(&h_data.data()[atomsStartAt]),
275                              &d_data,
276                              atomsStartAt,
277                              numAtomsToCopy,
278                              deviceStream,
279                              transferKind_,
280                              nullptr);
281     }
282 }
283
284 void StatePropagatorDataGpu::Impl::clearOnDevice(DeviceBuffer<RVec>  d_data,
285                                                  int                 dataSize,
286                                                  AtomLocality        atomLocality,
287                                                  const DeviceStream& deviceStream) const
288 {
289     GMX_UNUSED_VALUE(dataSize);
290
291     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
292
293     GMX_ASSERT(dataSize >= 0, "Trying to clear to device buffer before it was allocated.");
294
295     GMX_ASSERT(deviceStream.isValid(), "No stream is valid for clearing with given atom locality.");
296
297     int atomsStartAt, numAtomsToClear;
298     std::tie(atomsStartAt, numAtomsToClear) = getAtomRangesFromAtomLocality(atomLocality);
299
300     if (numAtomsToClear != 0)
301     {
302         GMX_ASSERT(atomsStartAt + numAtomsToClear <= dataSize,
303                    "The device allocation is smaller than requested clear range.");
304
305         clearDeviceBufferAsync(&d_data, atomsStartAt, numAtomsToClear, deviceStream);
306     }
307 }
308
309 DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getCoordinates()
310 {
311     return d_x_;
312 }
313
314 void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
315                                                         AtomLocality atomLocality)
316 {
317     GMX_ASSERT(atomLocality < AtomLocality::All,
318                formatString("Wrong atom locality. Only Local and NonLocal are allowed for "
319                             "coordinate transfers, passed value is \"%s\"",
320                             enumValueToString(atomLocality))
321                        .c_str());
322
323     const DeviceStream* deviceStream = xCopyStreams_[atomLocality];
324     GMX_ASSERT(deviceStream != nullptr,
325                "No stream is valid for copying positions with given atom locality.");
326
327     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
328     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
329
330     copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);
331
332     // marking is skipped on the PME-rank mode as everything is on the same stream
333     if (!isPmeOnly_)
334     {
335         xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
336     }
337
338     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
339     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
340 }
341
342 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(
343         AtomLocality              atomLocality,
344         const SimulationWorkload& simulationWork,
345         const StepWorkload&       stepWork,
346         GpuEventSynchronizer*     gpuCoordinateHaloLaunched)
347 {
348     // The provider of the coordinates may be different for local atoms. If the update is offloaded
349     // and this is not a neighbor search step, then the consumer needs to wait for the update
350     // to complete. Otherwise, the coordinates are copied from the host and we need to wait for
351     // the copy event. Non-local coordinates are provided by the GPU halo exchange (if active), otherwise by H2D copy.
352
353     if (atomLocality == AtomLocality::NonLocal && stepWork.useGpuXHalo)
354     {
355         GMX_ASSERT(gpuCoordinateHaloLaunched != nullptr,
356                    "GPU halo exchange is active but its completion event is null.");
357         return gpuCoordinateHaloLaunched;
358     }
359     if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
360     {
361         GMX_ASSERT(xUpdatedOnDeviceEvent_ != nullptr, "The event synchronizer can not be nullptr.");
362         return xUpdatedOnDeviceEvent_;
363     }
364     else
365     {
366         if (stepWork.doNeighborSearch && xUpdatedOnDeviceEvent_)
367         {
368             /* On search steps, we do not consume the result of the GPU update
369              * but rather that of a H2D transfer. So, we reset the event triggered after
370              * update to avoid leaving it unconsumed.
371              * Unfortunately, we don't always have the event marked either (e.g., on the
372              * first step) so we just reset it here.
373              * See Issue #3988. */
374             xUpdatedOnDeviceEvent_->reset();
375         }
376         return &xReadyOnDevice_[atomLocality];
377     }
378 }
379
380 void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
381 {
382     wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
383     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
384     xReadyOnDevice_[atomLocality].waitForEvent();
385     wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
386 }
387
388 void StatePropagatorDataGpu::Impl::consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
389 {
390     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
391     xReadyOnDevice_[atomLocality].consume();
392 }
393
394 void StatePropagatorDataGpu::Impl::resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
395 {
396     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
397     xReadyOnDevice_[atomLocality].reset();
398 }
399
400 void StatePropagatorDataGpu::Impl::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent)
401 {
402     GMX_ASSERT(xUpdatedOnDeviceEvent != nullptr, "The event synchronizer can not be nullptr.");
403     xUpdatedOnDeviceEvent_ = xUpdatedOnDeviceEvent;
404 }
405
406 void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
407                                                           AtomLocality             atomLocality,
408                                                           GpuEventSynchronizer*    dependency)
409 {
410     GMX_ASSERT(atomLocality < AtomLocality::All,
411                formatString("Wrong atom locality. Only Local and NonLocal are allowed for "
412                             "coordinate transfers, passed value is \"%s\"",
413                             enumValueToString(atomLocality))
414                        .c_str());
415     const DeviceStream* deviceStream = xCopyStreams_[atomLocality];
416     GMX_ASSERT(deviceStream != nullptr,
417                "No stream is valid for copying positions with given atom locality.");
418
419     if (dependency != nullptr)
420     {
421         dependency->enqueueWaitEvent(*deviceStream);
422     }
423
424     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
425     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
426
427     copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, *deviceStream);
428     // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
429     xReadyOnHost_[atomLocality].markEvent(*deviceStream);
430
431     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
432     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
433 }
434
435 void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
436 {
437     wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
438     xReadyOnHost_[atomLocality].waitForEvent();
439     wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
440 }
441
442
443 DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getVelocities()
444 {
445     return d_v_;
446 }
447
448 void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
449                                                        AtomLocality atomLocality)
450 {
451     GMX_ASSERT(atomLocality == AtomLocality::Local,
452                formatString("Wrong atom locality. Only Local is allowed for "
453                             "velocity transfers, passed value is \"%s\"",
454                             enumValueToString(atomLocality))
455                        .c_str());
456     const DeviceStream* deviceStream = vCopyStreams_[atomLocality];
457     GMX_ASSERT(deviceStream != nullptr,
458                "No stream is valid for copying velocities with given atom locality.");
459
460     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
461     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
462
463     copyToDevice(d_v_, h_v, d_vSize_, atomLocality, *deviceStream);
464     /* Not marking the event, because it is not used anywhere.
465      * Since we only use velocities on the device for update, and we launch the copy in
466      * the "update" stream, that should be safe.
467      */
468
469     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
470     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
471 }
472
473 void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v, AtomLocality atomLocality)
474 {
475     GMX_ASSERT(atomLocality == AtomLocality::Local,
476                formatString("Wrong atom locality. Only Local is allowed for "
477                             "velocity transfers, passed value is \"%s\"",
478                             enumValueToString(atomLocality))
479                        .c_str());
480     const DeviceStream* deviceStream = vCopyStreams_[atomLocality];
481     GMX_ASSERT(deviceStream != nullptr,
482                "No stream is valid for copying velocities with given atom locality.");
483
484     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
485     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
486
487     copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, *deviceStream);
488     vReadyOnHost_[atomLocality].markEvent(*deviceStream);
489
490     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
491     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
492 }
493
494 void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
495 {
496     wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
497     vReadyOnHost_[atomLocality].waitForEvent();
498     wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
499 }
500
501
502 DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getForces()
503 {
504     return d_f_;
505 }
506
507 // Copy CPU forces to GPU using stream internal to this module to allow overlap
508 // with GPU force calculations.
509 void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
510                                                    AtomLocality atomLocality)
511 {
512     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
513     DeviceStream* deviceStream = copyInStream_.get();
514     GMX_ASSERT(deviceStream != nullptr,
515                "No stream is valid for copying forces with given atom locality.");
516
517     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
518     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
519
520     copyToDevice(d_f_, h_f, d_fSize_, atomLocality, *deviceStream);
521     fReadyOnDevice_[atomLocality].markEvent(*deviceStream);
522
523     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
524     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
525 }
526
527 void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency)
528 {
529     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
530     DeviceStream* deviceStream = memsetStream_.get();
531
532     GMX_ASSERT(dependency != nullptr, "Dependency is not valid for clearing forces.");
533     dependency->enqueueWaitEvent(*deviceStream);
534
535     GMX_ASSERT(deviceStream != nullptr,
536                "No stream is valid for clearing forces with given atom locality.");
537
538     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
539     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
540
541     clearOnDevice(d_f_, d_fSize_, atomLocality, *deviceStream);
542
543     fReadyOnDevice_[atomLocality].markEvent(*deviceStream);
544
545     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
546     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
547 }
548
549 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
550                                                                                      SimulationWorkload simulationWork)
551 {
552     if (stepWork.useGpuFBufferOps && !simulationWork.useCpuPmePpCommunication)
553     {
554         return &fReducedOnDevice_[AtomLocality::Local];
555     }
556     else
557     {
558         return &fReadyOnDevice_[AtomLocality::Local];
559     }
560 }
561
562 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice(AtomLocality atomLocality)
563 {
564     return &fReducedOnDevice_[atomLocality];
565 }
566
567 void StatePropagatorDataGpu::Impl::consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality)
568 {
569     fReducedOnDevice_[atomLocality].consume();
570 }
571
572 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReadyOnDevice(AtomLocality atomLocality)
573 {
574     return &fReadyOnDevice_[atomLocality];
575 }
576
577 void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
578 {
579     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
580     const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
581     GMX_ASSERT(deviceStream != nullptr,
582                "No stream is valid for copying forces with given atom locality.");
583
584     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
585     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
586
587     copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, *deviceStream);
588     fReadyOnHost_[atomLocality].markEvent(*deviceStream);
589
590     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
591     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
592 }
593
594 void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)
595 {
596     wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
597     fReadyOnHost_[atomLocality].waitForEvent();
598     wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
599 }
600
601 const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream()
602 {
603     return updateStream_;
604 }
605
606 int StatePropagatorDataGpu::Impl::numAtomsLocal() const
607 {
608     return numAtomsLocal_;
609 }
610
611 int StatePropagatorDataGpu::Impl::numAtomsAll() const
612 {
613     return numAtomsAll_;
614 }
615
616
617 StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
618                                                GpuApiCallBehavior         transferKind,
619                                                int            allocationBlockSizeDivisor,
620                                                gmx_wallcycle* wcycle) :
621     impl_(new Impl(deviceStreamManager, transferKind, allocationBlockSizeDivisor, wcycle))
622 {
623 }
624
625 StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream*  pmeStream,
626                                                const DeviceContext& deviceContext,
627                                                GpuApiCallBehavior   transferKind,
628                                                int                  allocationBlockSizeDivisor,
629                                                gmx_wallcycle*       wcycle) :
630     impl_(new Impl(pmeStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
631 {
632 }
633
634 StatePropagatorDataGpu::StatePropagatorDataGpu(StatePropagatorDataGpu&& /* other */) noexcept = default;
635
636 StatePropagatorDataGpu& StatePropagatorDataGpu::operator=(StatePropagatorDataGpu&& /* other */) noexcept = default;
637
638 StatePropagatorDataGpu::~StatePropagatorDataGpu() = default;
639
640
641 void StatePropagatorDataGpu::reinit(int numAtomsLocal, int numAtomsAll)
642 {
643     return impl_->reinit(numAtomsLocal, numAtomsAll);
644 }
645
646 std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
647 {
648     return impl_->getAtomRangesFromAtomLocality(atomLocality);
649 }
650
651
652 DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
653 {
654     return impl_->getCoordinates();
655 }
656
657 void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
658                                                   AtomLocality                         atomLocality)
659 {
660     return impl_->copyCoordinatesToGpu(h_x, atomLocality);
661 }
662
663 GpuEventSynchronizer*
664 StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
665                                                          const SimulationWorkload& simulationWork,
666                                                          const StepWorkload&       stepWork,
667                                                          GpuEventSynchronizer* gpuCoordinateHaloLaunched)
668 {
669     return impl_->getCoordinatesReadyOnDeviceEvent(
670             atomLocality, simulationWork, stepWork, gpuCoordinateHaloLaunched);
671 }
672
673 void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
674 {
675     return impl_->waitCoordinatesCopiedToDevice(atomLocality);
676 }
677
678 void StatePropagatorDataGpu::consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
679 {
680     return impl_->consumeCoordinatesCopiedToDeviceEvent(atomLocality);
681 }
682
683 void StatePropagatorDataGpu::resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
684 {
685     return impl_->resetCoordinatesCopiedToDeviceEvent(atomLocality);
686 }
687
688 void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent)
689 {
690     impl_->setXUpdatedOnDeviceEvent(xUpdatedOnDeviceEvent);
691 }
692
693 void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec>   h_x,
694                                                     AtomLocality          atomLocality,
695                                                     GpuEventSynchronizer* dependency)
696 {
697     return impl_->copyCoordinatesFromGpu(h_x, atomLocality, dependency);
698 }
699
700 void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
701 {
702     return impl_->waitCoordinatesReadyOnHost(atomLocality);
703 }
704
705
706 DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
707 {
708     return impl_->getVelocities();
709 }
710
711 void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
712                                                  AtomLocality                         atomLocality)
713 {
714     return impl_->copyVelocitiesToGpu(h_v, atomLocality);
715 }
716
717 void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<RVec> h_v, AtomLocality atomLocality)
718 {
719     return impl_->copyVelocitiesFromGpu(h_v, atomLocality);
720 }
721
722 void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
723 {
724     return impl_->waitVelocitiesReadyOnHost(atomLocality);
725 }
726
727
728 DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
729 {
730     return impl_->getForces();
731 }
732
733 void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality)
734 {
735     return impl_->copyForcesToGpu(h_f, atomLocality);
736 }
737
738 void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency)
739 {
740     return impl_->clearForcesOnGpu(atomLocality, dependency);
741 }
742
743 GpuEventSynchronizer* StatePropagatorDataGpu::getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
744                                                                                SimulationWorkload simulationWork)
745 {
746     return impl_->getLocalForcesReadyOnDeviceEvent(stepWork, simulationWork);
747 }
748
749 GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality atomLocality)
750 {
751     return impl_->fReducedOnDevice(atomLocality);
752 }
753
754 void StatePropagatorDataGpu::consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality)
755 {
756     impl_->consumeForcesReducedOnDeviceEvent(atomLocality);
757 }
758
759 GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality atomLocality)
760 {
761     return impl_->fReadyOnDevice(atomLocality);
762 }
763
764 void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec> h_f, AtomLocality atomLocality)
765 {
766     return impl_->copyForcesFromGpu(h_f, atomLocality);
767 }
768
769 void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality atomLocality)
770 {
771     return impl_->waitForcesReadyOnHost(atomLocality);
772 }
773
774
775 const DeviceStream* StatePropagatorDataGpu::getUpdateStream()
776 {
777     return impl_->getUpdateStream();
778 }
779
780 int StatePropagatorDataGpu::numAtomsLocal() const
781 {
782     return impl_->numAtomsLocal();
783 }
784
785 int StatePropagatorDataGpu::numAtomsAll() const
786 {
787     return impl_->numAtomsAll();
788 }
789
790 } // namespace gmx
791
792 #endif // GMX_GPU