Move the ownership of the xUpdatedOnDevice event to update constraints
[alexxy/gromacs.git] / src / gromacs / mdtypes / state_propagator_data_gpu_impl_gpu.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*! \internal \file
36  *
37  * \brief Definitions of interfaces for GPU state data propagator object.
38  *
39  * \author Artem Zhmurov <zhmurov@gmail.com>
40  *
41  * \ingroup module_mdtypes
42  */
43 #include "gmxpre.h"
44
45 #include "config.h"
46
47 #if GMX_GPU
48
49 #    include "gromacs/gpu_utils/device_stream_manager.h"
50 #    include "gromacs/gpu_utils/devicebuffer.h"
51 #    include "gromacs/gpu_utils/gpueventsynchronizer.h"
52 #    include "gromacs/math/vectypes.h"
53 #    include "gromacs/mdtypes/state_propagator_data_gpu.h"
54 #    include "gromacs/timing/wallcycle.h"
55 #    include "gromacs/utility/classhelpers.h"
56
57 #    include "state_propagator_data_gpu_impl.h"
58
59
60 namespace gmx
61 {
62
63 StatePropagatorDataGpu::Impl::Impl(const DeviceStreamManager& deviceStreamManager,
64                                    GpuApiCallBehavior         transferKind,
65                                    int                        allocationBlockSizeDivisor,
66                                    gmx_wallcycle*             wcycle) :
67     deviceContext_(deviceStreamManager.context()),
68     transferKind_(transferKind),
69     allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
70     wcycle_(wcycle)
71 {
72     static_assert(
73             GMX_GPU,
74             "GPU state propagator data object should only be constructed on the GPU code-paths.");
75
76     // We need to keep local copies for re-initialization.
77     pmeStream_      = &deviceStreamManager.stream(DeviceStreamType::Pme);
78     localStream_    = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal);
79     nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal);
80     // PME stream is used in OpenCL for H2D coordinate transfer
81     updateStream_ = &deviceStreamManager.stream(
82             GMX_GPU_OPENCL ? DeviceStreamType::Pme : DeviceStreamType::UpdateAndConstraints);
83
84     // Map the atom locality to the stream that will be used for coordinates,
85     // velocities and forces transfers. Same streams are used for H2D and D2H copies.
86     // Note, that nullptr stream is used here to indicate that the copy is not supported.
87     xCopyStreams_[AtomLocality::Local]    = updateStream_;
88     xCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
89     xCopyStreams_[AtomLocality::All]      = nullptr;
90
91     vCopyStreams_[AtomLocality::Local]    = updateStream_;
92     vCopyStreams_[AtomLocality::NonLocal] = nullptr;
93     vCopyStreams_[AtomLocality::All]      = nullptr;
94
95     fCopyStreams_[AtomLocality::Local]    = localStream_;
96     fCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
97     fCopyStreams_[AtomLocality::All]      = updateStream_;
98 }
99
100 StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
101                                    const DeviceContext& deviceContext,
102                                    GpuApiCallBehavior   transferKind,
103                                    int                  allocationBlockSizeDivisor,
104                                    gmx_wallcycle*       wcycle) :
105     deviceContext_(deviceContext),
106     transferKind_(transferKind),
107     allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
108     wcycle_(wcycle)
109 {
110     static_assert(
111             GMX_GPU,
112             "GPU state propagator data object should only be constructed on the GPU code-paths.");
113
114     GMX_ASSERT(pmeStream->isValid(), "GPU PME stream should be valid.");
115     pmeStream_      = pmeStream;
116     localStream_    = pmeStream; // For clearing the force buffer
117     nonLocalStream_ = nullptr;
118     updateStream_   = nullptr;
119
120
121     // Only local/all coordinates are allowed to be copied in PME-only rank/ PME tests.
122     // This it temporary measure to make it safe to use this class in those cases.
123     xCopyStreams_[AtomLocality::Local]    = pmeStream_;
124     xCopyStreams_[AtomLocality::NonLocal] = nullptr;
125     xCopyStreams_[AtomLocality::All]      = nullptr;
126
127     vCopyStreams_[AtomLocality::Local]    = nullptr;
128     vCopyStreams_[AtomLocality::NonLocal] = nullptr;
129     vCopyStreams_[AtomLocality::All]      = nullptr;
130
131     fCopyStreams_[AtomLocality::Local]    = nullptr;
132     fCopyStreams_[AtomLocality::NonLocal] = nullptr;
133     fCopyStreams_[AtomLocality::All]      = nullptr;
134 }
135
136 StatePropagatorDataGpu::Impl::~Impl() {}
137
138 void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
139 {
140     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
141     wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
142
143     numAtomsLocal_ = numAtomsLocal;
144     numAtomsAll_   = numAtomsAll;
145
146     int numAtomsPadded;
147     if (allocationBlockSizeDivisor_ > 0)
148     {
149         numAtomsPadded = ((numAtomsAll_ + allocationBlockSizeDivisor_ - 1) / allocationBlockSizeDivisor_)
150                          * allocationBlockSizeDivisor_;
151     }
152     else
153     {
154         numAtomsPadded = numAtomsAll_;
155     }
156
157     reallocateDeviceBuffer(&d_x_, numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);
158
159     const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
160     if (paddingAllocationSize > 0)
161     {
162         // The PME stream is used here because the padding region of d_x_ is only in the PME task.
163         clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, *pmeStream_);
164     }
165
166     reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
167     const int d_fOldCapacity = d_fCapacity_;
168     reallocateDeviceBuffer(&d_f_, numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
169
170     // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
171     // the force accumulation stage before syncing with the local stream. Only done in CUDA,
172     // since the force buffer ops are not implemented in OpenCL.
173     if (GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
174     {
175         clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_);
176     }
177
178     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
179     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
180 }
181
182 std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
183 {
184     int atomsStartAt   = 0;
185     int numAtomsToCopy = 0;
186     switch (atomLocality)
187     {
188         case AtomLocality::All:
189             atomsStartAt   = 0;
190             numAtomsToCopy = numAtomsAll_;
191             break;
192         case AtomLocality::Local:
193             atomsStartAt   = 0;
194             numAtomsToCopy = numAtomsLocal_;
195             break;
196         case AtomLocality::NonLocal:
197             atomsStartAt   = numAtomsLocal_;
198             numAtomsToCopy = numAtomsAll_ - numAtomsLocal_;
199             break;
200         default:
201             GMX_RELEASE_ASSERT(false,
202                                "Wrong range of atoms requested in GPU state data manager. Should "
203                                "be All, Local or NonLocal.");
204     }
205     GMX_ASSERT(atomsStartAt >= 0,
206                "The first elemtnt to copy has negative index. Probably, the GPU propagator state "
207                "was not initialized.");
208     GMX_ASSERT(numAtomsToCopy >= 0,
209                "Number of atoms to copy is negative. Probably, the GPU propagator state was not "
210                "initialized.");
211     return std::make_tuple(atomsStartAt, numAtomsToCopy);
212 }
213
214 void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>                   d_data,
215                                                 const gmx::ArrayRef<const gmx::RVec> h_data,
216                                                 int                                  dataSize,
217                                                 AtomLocality                         atomLocality,
218                                                 const DeviceStream&                  deviceStream)
219 {
220     GMX_UNUSED_VALUE(dataSize);
221
222     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
223
224     GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
225
226     GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
227
228     int atomsStartAt, numAtomsToCopy;
229     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
230
231     if (numAtomsToCopy != 0)
232     {
233         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
234                    "The device allocation is smaller than requested copy range.");
235         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
236                    "The host buffer is smaller than the requested copy range.");
237
238         copyToDeviceBuffer(&d_data,
239                            reinterpret_cast<const RVec*>(&h_data.data()[atomsStartAt]),
240                            atomsStartAt,
241                            numAtomsToCopy,
242                            deviceStream,
243                            transferKind_,
244                            nullptr);
245     }
246 }
247
248 void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
249                                                   DeviceBuffer<RVec>       d_data,
250                                                   int                      dataSize,
251                                                   AtomLocality             atomLocality,
252                                                   const DeviceStream&      deviceStream)
253 {
254     GMX_UNUSED_VALUE(dataSize);
255
256     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
257
258     GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
259
260     GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
261
262     int atomsStartAt, numAtomsToCopy;
263     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
264
265     if (numAtomsToCopy != 0)
266     {
267         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
268                    "The device allocation is smaller than requested copy range.");
269         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
270                    "The host buffer is smaller than the requested copy range.");
271
272         copyFromDeviceBuffer(reinterpret_cast<RVec*>(&h_data.data()[atomsStartAt]),
273                              &d_data,
274                              atomsStartAt,
275                              numAtomsToCopy,
276                              deviceStream,
277                              transferKind_,
278                              nullptr);
279     }
280 }
281
282 void StatePropagatorDataGpu::Impl::clearOnDevice(DeviceBuffer<RVec>  d_data,
283                                                  int                 dataSize,
284                                                  AtomLocality        atomLocality,
285                                                  const DeviceStream& deviceStream) const
286 {
287     GMX_UNUSED_VALUE(dataSize);
288
289     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
290
291     GMX_ASSERT(dataSize >= 0, "Trying to clear to device buffer before it was allocated.");
292
293     GMX_ASSERT(deviceStream.isValid(), "No stream is valid for clearing with given atom locality.");
294
295     int atomsStartAt, numAtomsToClear;
296     std::tie(atomsStartAt, numAtomsToClear) = getAtomRangesFromAtomLocality(atomLocality);
297
298     if (numAtomsToClear != 0)
299     {
300         GMX_ASSERT(atomsStartAt + numAtomsToClear <= dataSize,
301                    "The device allocation is smaller than requested clear range.");
302
303         clearDeviceBufferAsync(&d_data, atomsStartAt, numAtomsToClear, deviceStream);
304     }
305 }
306
307 DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getCoordinates()
308 {
309     return d_x_;
310 }
311
312 void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
313                                                         AtomLocality atomLocality)
314 {
315     GMX_ASSERT(atomLocality < AtomLocality::All,
316                formatString("Wrong atom locality. Only Local and NonLocal are allowed for "
317                             "coordinate transfers, passed value is \"%s\"",
318                             enumValueToString(atomLocality))
319                        .c_str());
320
321     const DeviceStream* deviceStream = xCopyStreams_[atomLocality];
322     GMX_ASSERT(deviceStream != nullptr,
323                "No stream is valid for copying positions with given atom locality.");
324
325     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
326     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
327
328     copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);
329
330     // markEvent is skipped in OpenCL as:
331     //   - it's not needed, copy is done in the same stream as the only consumer task (PME)
332     //   - we don't consume the events in OpenCL which is not allowed by GpuEventSynchronizer (would leak memory).
333     // TODO: remove this by adding an event-mark free flavor of this function
334     if (GMX_GPU_CUDA)
335     {
336         xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
337     }
338
339     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
340     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
341 }
342
343 GpuEventSynchronizer*
344 StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
345                                                                const SimulationWorkload& simulationWork,
346                                                                const StepWorkload&       stepWork)
347 {
348     // The provider of the coordinates may be different for local atoms. If the update is offloaded
349     // and this is not a neighbor search step, then the consumer needs to wait for the update
350     // to complete. Otherwise, the coordinates are copied from the host and we need to wait for
351     // the copy event. Non-local coordinates are always provided by the H2D copy.
352     //
353     // TODO: This should be reconsidered to support the halo exchange.
354     //
355     // In OpenCL no events are used as coordinate sync is not necessary
356     if (GMX_GPU_OPENCL)
357     {
358         return nullptr;
359     }
360     if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
361     {
362         GMX_ASSERT(xUpdatedOnDeviceEvent_ != nullptr, "The event synchronizer can not be nullptr.");
363         return xUpdatedOnDeviceEvent_;
364     }
365     else
366     {
367         return &xReadyOnDevice_[atomLocality];
368     }
369 }
370
371 void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
372 {
373     wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
374     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
375     xReadyOnDevice_[atomLocality].waitForEvent();
376     wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
377 }
378
379 void StatePropagatorDataGpu::Impl::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent)
380 {
381     GMX_ASSERT(xUpdatedOnDeviceEvent != nullptr, "The event synchronizer can not be nullptr.");
382     xUpdatedOnDeviceEvent_ = xUpdatedOnDeviceEvent;
383 }
384
385 void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
386 {
387     GMX_ASSERT(atomLocality < AtomLocality::All,
388                formatString("Wrong atom locality. Only Local and NonLocal are allowed for "
389                             "coordinate transfers, passed value is \"%s\"",
390                             enumValueToString(atomLocality))
391                        .c_str());
392     const DeviceStream* deviceStream = xCopyStreams_[atomLocality];
393     GMX_ASSERT(deviceStream != nullptr,
394                "No stream is valid for copying positions with given atom locality.");
395
396     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
397     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
398
399     copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, *deviceStream);
400     // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
401     xReadyOnHost_[atomLocality].markEvent(*deviceStream);
402
403     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
404     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
405 }
406
407 void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
408 {
409     wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
410     xReadyOnHost_[atomLocality].waitForEvent();
411     wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
412 }
413
414
415 DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getVelocities()
416 {
417     return d_v_;
418 }
419
420 void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
421                                                        AtomLocality atomLocality)
422 {
423     GMX_ASSERT(atomLocality == AtomLocality::Local,
424                formatString("Wrong atom locality. Only Local is allowed for "
425                             "velocity transfers, passed value is \"%s\"",
426                             enumValueToString(atomLocality))
427                        .c_str());
428     const DeviceStream* deviceStream = vCopyStreams_[atomLocality];
429     GMX_ASSERT(deviceStream != nullptr,
430                "No stream is valid for copying velocities with given atom locality.");
431
432     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
433     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
434
435     copyToDevice(d_v_, h_v, d_vSize_, atomLocality, *deviceStream);
436     /* Not marking the event, because it is not used anywhere.
437      * Since we only use velocities on the device for update, and we launch the copy in
438      * the "update" stream, that should be safe.
439      */
440
441     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
442     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
443 }
444
445 void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v, AtomLocality atomLocality)
446 {
447     GMX_ASSERT(atomLocality == AtomLocality::Local,
448                formatString("Wrong atom locality. Only Local is allowed for "
449                             "velocity transfers, passed value is \"%s\"",
450                             enumValueToString(atomLocality))
451                        .c_str());
452     const DeviceStream* deviceStream = vCopyStreams_[atomLocality];
453     GMX_ASSERT(deviceStream != nullptr,
454                "No stream is valid for copying velocities with given atom locality.");
455
456     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
457     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
458
459     copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, *deviceStream);
460     vReadyOnHost_[atomLocality].markEvent(*deviceStream);
461
462     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
463     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
464 }
465
466 void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
467 {
468     wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
469     vReadyOnHost_[atomLocality].waitForEvent();
470     wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
471 }
472
473
474 DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getForces()
475 {
476     return d_f_;
477 }
478
479 void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
480                                                    AtomLocality atomLocality)
481 {
482     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
483     const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
484     GMX_ASSERT(deviceStream != nullptr,
485                "No stream is valid for copying forces with given atom locality.");
486
487     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
488     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
489
490     copyToDevice(d_f_, h_f, d_fSize_, atomLocality, *deviceStream);
491     fReadyOnDevice_[atomLocality].markEvent(*deviceStream);
492
493     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
494     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
495 }
496
497 void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
498 {
499     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
500     const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
501     GMX_ASSERT(deviceStream != nullptr,
502                "No stream is valid for clearing forces with given atom locality.");
503
504     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
505     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
506
507     clearOnDevice(d_f_, d_fSize_, atomLocality, *deviceStream);
508
509     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
510     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
511 }
512
513 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
514                                                                                 bool useGpuFBufferOps)
515 {
516     if ((atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal) && useGpuFBufferOps)
517     {
518         return &fReducedOnDevice_;
519     }
520     else
521     {
522         return &fReadyOnDevice_[atomLocality];
523     }
524 }
525
526 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice()
527 {
528     return &fReducedOnDevice_;
529 }
530
531 void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
532 {
533     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
534     const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
535     GMX_ASSERT(deviceStream != nullptr,
536                "No stream is valid for copying forces with given atom locality.");
537
538     wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
539     wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
540
541     copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, *deviceStream);
542     fReadyOnHost_[atomLocality].markEvent(*deviceStream);
543
544     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
545     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
546 }
547
548 void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)
549 {
550     wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
551     fReadyOnHost_[atomLocality].waitForEvent();
552     wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
553 }
554
555 const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream()
556 {
557     return updateStream_;
558 }
559
560 int StatePropagatorDataGpu::Impl::numAtomsLocal() const
561 {
562     return numAtomsLocal_;
563 }
564
565 int StatePropagatorDataGpu::Impl::numAtomsAll() const
566 {
567     return numAtomsAll_;
568 }
569
570
571 StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
572                                                GpuApiCallBehavior         transferKind,
573                                                int            allocationBlockSizeDivisor,
574                                                gmx_wallcycle* wcycle) :
575     impl_(new Impl(deviceStreamManager, transferKind, allocationBlockSizeDivisor, wcycle))
576 {
577 }
578
579 StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream*  pmeStream,
580                                                const DeviceContext& deviceContext,
581                                                GpuApiCallBehavior   transferKind,
582                                                int                  allocationBlockSizeDivisor,
583                                                gmx_wallcycle*       wcycle) :
584     impl_(new Impl(pmeStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
585 {
586 }
587
588 StatePropagatorDataGpu::StatePropagatorDataGpu(StatePropagatorDataGpu&& /* other */) noexcept = default;
589
590 StatePropagatorDataGpu& StatePropagatorDataGpu::operator=(StatePropagatorDataGpu&& /* other */) noexcept = default;
591
592 StatePropagatorDataGpu::~StatePropagatorDataGpu() = default;
593
594
595 void StatePropagatorDataGpu::reinit(int numAtomsLocal, int numAtomsAll)
596 {
597     return impl_->reinit(numAtomsLocal, numAtomsAll);
598 }
599
600 std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality) const
601 {
602     return impl_->getAtomRangesFromAtomLocality(atomLocality);
603 }
604
605
606 DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
607 {
608     return impl_->getCoordinates();
609 }
610
611 void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
612                                                   AtomLocality                         atomLocality)
613 {
614     return impl_->copyCoordinatesToGpu(h_x, atomLocality);
615 }
616
617 GpuEventSynchronizer*
618 StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
619                                                          const SimulationWorkload& simulationWork,
620                                                          const StepWorkload&       stepWork)
621 {
622     return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
623 }
624
625 void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
626 {
627     return impl_->waitCoordinatesCopiedToDevice(atomLocality);
628 }
629
630 void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent)
631 {
632     impl_->setXUpdatedOnDeviceEvent(xUpdatedOnDeviceEvent);
633 }
634
635 void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec> h_x, AtomLocality atomLocality)
636 {
637     return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
638 }
639
640 void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
641 {
642     return impl_->waitCoordinatesReadyOnHost(atomLocality);
643 }
644
645
646 DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
647 {
648     return impl_->getVelocities();
649 }
650
651 void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
652                                                  AtomLocality                         atomLocality)
653 {
654     return impl_->copyVelocitiesToGpu(h_v, atomLocality);
655 }
656
657 void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<RVec> h_v, AtomLocality atomLocality)
658 {
659     return impl_->copyVelocitiesFromGpu(h_v, atomLocality);
660 }
661
662 void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
663 {
664     return impl_->waitVelocitiesReadyOnHost(atomLocality);
665 }
666
667
668 DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
669 {
670     return impl_->getForces();
671 }
672
673 void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality)
674 {
675     return impl_->copyForcesToGpu(h_f, atomLocality);
676 }
677
678 void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality)
679 {
680     return impl_->clearForcesOnGpu(atomLocality);
681 }
682
683 GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
684                                                                           bool useGpuFBufferOps)
685 {
686     return impl_->getForcesReadyOnDeviceEvent(atomLocality, useGpuFBufferOps);
687 }
688
689 GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
690 {
691     return impl_->fReducedOnDevice();
692 }
693
694 void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec> h_f, AtomLocality atomLocality)
695 {
696     return impl_->copyForcesFromGpu(h_f, atomLocality);
697 }
698
699 void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality atomLocality)
700 {
701     return impl_->waitForcesReadyOnHost(atomLocality);
702 }
703
704
705 const DeviceStream* StatePropagatorDataGpu::getUpdateStream()
706 {
707     return impl_->getUpdateStream();
708 }
709
710 int StatePropagatorDataGpu::numAtomsLocal() const
711 {
712     return impl_->numAtomsLocal();
713 }
714
715 int StatePropagatorDataGpu::numAtomsAll() const
716 {
717     return impl_->numAtomsAll();
718 }
719
720 } // namespace gmx
721
722 #endif // GMX_GPU