031327e80d844b1917b4c65c5317e95aac721b56
[alexxy/gromacs.git] / src / gromacs / mdtypes / state_propagator_data_gpu_impl_gpu.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2019,2020, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*! \internal \file
36  *
37  * \brief Definitions of interfaces for GPU state data propagator object.
38  *
39  * \author Artem Zhmurov <zhmurov@gmail.com>
40  *
41  * \ingroup module_mdtypes
42  */
43 #include "gmxpre.h"
44
45 #include "config.h"
46
47 #if GMX_GPU != GMX_GPU_NONE
48
49 #    if GMX_GPU == GMX_GPU_CUDA
50 #        include "gromacs/gpu_utils/cudautils.cuh"
51 #    endif
52 #    include "gromacs/gpu_utils/devicebuffer.h"
53 #    if GMX_GPU == GMX_GPU_OPENCL
54 #        include "gromacs/gpu_utils/oclutils.h"
55 #    endif
56 #    include "gromacs/math/vectypes.h"
57 #    include "gromacs/mdtypes/state_propagator_data_gpu.h"
58 #    include "gromacs/timing/wallcycle.h"
59 #    include "gromacs/utility/classhelpers.h"
60
61 #    include "state_propagator_data_gpu_impl.h"
62
63
64 namespace gmx
65 {
66
67 StatePropagatorDataGpu::Impl::Impl(const void*        pmeStream,
68                                    const void*        localStream,
69                                    const void*        nonLocalStream,
70                                    const void*        deviceContext,
71                                    GpuApiCallBehavior transferKind,
72                                    int                paddingSize,
73                                    gmx_wallcycle*     wcycle) :
74     transferKind_(transferKind),
75     paddingSize_(paddingSize),
76     wcycle_(wcycle)
77 {
78     static_assert(GMX_GPU != GMX_GPU_NONE,
79                   "This object should only be constructed on the GPU code-paths.");
80
81     // TODO: Refactor when the StreamManager is introduced.
82     if (GMX_GPU == GMX_GPU_OPENCL)
83     {
84         GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds.");
85         GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");
86
87         // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context.
88         pmeStream_     = *static_cast<const CommandStream*>(pmeStream);
89         updateStream_  = *static_cast<const CommandStream*>(pmeStream);
90         deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
91         GMX_UNUSED_VALUE(localStream);
92         GMX_UNUSED_VALUE(nonLocalStream);
93     }
94
95     if (GMX_GPU == GMX_GPU_CUDA)
96     {
97         if (pmeStream != nullptr)
98         {
99             pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
100         }
101         if (localStream != nullptr)
102         {
103             localStream_ = *static_cast<const CommandStream*>(localStream);
104         }
105         if (nonLocalStream != nullptr)
106         {
107             nonLocalStream_ = *static_cast<const CommandStream*>(nonLocalStream);
108         }
109
110         // TODO: The update stream should be created only when it is needed.
111 #    if (GMX_GPU == GMX_GPU_CUDA)
112         cudaError_t stat;
113         stat = cudaStreamCreate(&updateStream_);
114         CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
115 #    endif
116         GMX_UNUSED_VALUE(deviceContext);
117     }
118
119     // Map the atom locality to the stream that will be used for coordinates,
120     // velocities and forces transfers. Same streams are used for H2D and D2H copies.
121     // Note, that nullptr stream is used here to indicate that the copy is not supported.
122     xCopyStreams_[AtomLocality::Local]    = updateStream_;
123     xCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
124     xCopyStreams_[AtomLocality::All]      = updateStream_;
125
126     vCopyStreams_[AtomLocality::Local]    = updateStream_;
127     vCopyStreams_[AtomLocality::NonLocal] = nullptr;
128     vCopyStreams_[AtomLocality::All]      = updateStream_;
129
130     fCopyStreams_[AtomLocality::Local]    = localStream_;
131     fCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
132     fCopyStreams_[AtomLocality::All]      = updateStream_;
133 }
134
135 StatePropagatorDataGpu::Impl::Impl(const void*        pmeStream,
136                                    const void*        deviceContext,
137                                    GpuApiCallBehavior transferKind,
138                                    int                paddingSize,
139                                    gmx_wallcycle*     wcycle) :
140     transferKind_(transferKind),
141     paddingSize_(paddingSize),
142     wcycle_(wcycle)
143 {
144     static_assert(GMX_GPU != GMX_GPU_NONE,
145                   "This object should only be constructed on the GPU code-paths.");
146
147     if (GMX_GPU == GMX_GPU_OPENCL)
148     {
149         GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds.");
150         deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
151     }
152
153     GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set.");
154     pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
155
156     localStream_    = nullptr;
157     nonLocalStream_ = nullptr;
158     updateStream_   = nullptr;
159
160
161     // Only local/all coordinates are allowed to be copied in PME-only rank/ PME tests.
162     // This it temporary measure to make it safe to use this class in those cases.
163     xCopyStreams_[AtomLocality::Local]    = pmeStream_;
164     xCopyStreams_[AtomLocality::NonLocal] = nullptr;
165     xCopyStreams_[AtomLocality::All]      = pmeStream_;
166
167     vCopyStreams_[AtomLocality::Local]    = nullptr;
168     vCopyStreams_[AtomLocality::NonLocal] = nullptr;
169     vCopyStreams_[AtomLocality::All]      = nullptr;
170
171     fCopyStreams_[AtomLocality::Local]    = nullptr;
172     fCopyStreams_[AtomLocality::NonLocal] = nullptr;
173     fCopyStreams_[AtomLocality::All]      = nullptr;
174 }
175
176 StatePropagatorDataGpu::Impl::~Impl() {}
177
178 void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
179 {
180     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
181     wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
182
183     numAtomsLocal_ = numAtomsLocal;
184     numAtomsAll_   = numAtomsAll;
185
186     int numAtomsPadded;
187     if (paddingSize_ > 0)
188     {
189         numAtomsPadded = ((numAtomsAll_ + paddingSize_ - 1) / paddingSize_) * paddingSize_;
190     }
191     else
192     {
193         numAtomsPadded = numAtomsAll_;
194     }
195
196     reallocateDeviceBuffer(&d_x_, DIM * numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);
197
198     const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
199     if (paddingAllocationSize > 0)
200     {
201         // The PME stream is used here because the padding region of d_x_ is only in the PME task.
202         clearDeviceBufferAsync(&d_x_, DIM * numAtomsAll_, DIM * paddingAllocationSize, pmeStream_);
203     }
204
205     reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
206     const int d_fOldCapacity = d_fCapacity_;
207     reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
208     // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
209     // the force accumulation stage before syncing with the local stream. Only done in CUDA,
210     // since the force buffer ops are not implemented in OpenCL.
211     if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
212     {
213         clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
214     }
215
216     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
217     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
218 }
219
220 std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
221 {
222     int atomsStartAt   = 0;
223     int numAtomsToCopy = 0;
224     switch (atomLocality)
225     {
226         case AtomLocality::All:
227             atomsStartAt   = 0;
228             numAtomsToCopy = numAtomsAll_;
229             break;
230         case AtomLocality::Local:
231             atomsStartAt   = 0;
232             numAtomsToCopy = numAtomsLocal_;
233             break;
234         case AtomLocality::NonLocal:
235             atomsStartAt   = numAtomsLocal_;
236             numAtomsToCopy = numAtomsAll_ - numAtomsLocal_;
237             break;
238         default:
239             GMX_RELEASE_ASSERT(false,
240                                "Wrong range of atoms requested in GPU state data manager. Should "
241                                "be All, Local or NonLocal.");
242     }
243     GMX_ASSERT(atomsStartAt >= 0,
244                "The first elemtnt to copy has negative index. Probably, the GPU propagator state "
245                "was not initialized.");
246     GMX_ASSERT(numAtomsToCopy >= 0,
247                "Number of atoms to copy is negative. Probably, the GPU propagator state was not "
248                "initialized.");
249     return std::make_tuple(atomsStartAt, numAtomsToCopy);
250 }
251
252 void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>                  d_data,
253                                                 const gmx::ArrayRef<const gmx::RVec> h_data,
254                                                 int                                  dataSize,
255                                                 AtomLocality                         atomLocality,
256                                                 CommandStream                        commandStream)
257 {
258     GMX_UNUSED_VALUE(dataSize);
259
260     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
261
262     GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
263
264     GMX_ASSERT(commandStream != nullptr,
265                "No stream is valid for copying with given atom locality.");
266     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
267     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
268
269     int atomsStartAt, numAtomsToCopy;
270     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
271
272     int elementsStartAt   = atomsStartAt * DIM;
273     int numElementsToCopy = numAtomsToCopy * DIM;
274
275     if (numAtomsToCopy != 0)
276     {
277         GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize,
278                    "The device allocation is smaller than requested copy range.");
279         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
280                    "The host buffer is smaller than the requested copy range.");
281
282         copyToDeviceBuffer(&d_data, reinterpret_cast<const float*>(&h_data.data()[atomsStartAt]),
283                            elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
284     }
285
286     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
287     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
288 }
289
290 void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
291                                                   DeviceBuffer<float>      d_data,
292                                                   int                      dataSize,
293                                                   AtomLocality             atomLocality,
294                                                   CommandStream            commandStream)
295 {
296     GMX_UNUSED_VALUE(dataSize);
297
298     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
299
300     GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
301
302     GMX_ASSERT(commandStream != nullptr,
303                "No stream is valid for copying with given atom locality.");
304     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
305     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
306
307     int atomsStartAt, numAtomsToCopy;
308     std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
309
310     int elementsStartAt   = atomsStartAt * DIM;
311     int numElementsToCopy = numAtomsToCopy * DIM;
312
313     if (numAtomsToCopy != 0)
314     {
315         GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize,
316                    "The device allocation is smaller than requested copy range.");
317         GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
318                    "The host buffer is smaller than the requested copy range.");
319
320         copyFromDeviceBuffer(reinterpret_cast<float*>(&h_data.data()[atomsStartAt]), &d_data,
321                              elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
322     }
323
324     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
325     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
326 }
327
328 DeviceBuffer<float> StatePropagatorDataGpu::Impl::getCoordinates()
329 {
330     return d_x_;
331 }
332
333 void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
334                                                         AtomLocality atomLocality)
335 {
336     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
337     CommandStream commandStream = xCopyStreams_[atomLocality];
338     GMX_ASSERT(commandStream != nullptr,
339                "No stream is valid for copying positions with given atom locality.");
340
341     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
342     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
343
344     copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream);
345
346     // markEvent is skipped in OpenCL as:
347     //   - it's not needed, copy is done in the same stream as the only consumer task (PME)
348     //   - we don't consume the events in OpenCL which is not allowed by GpuEventSynchronizer (would leak memory).
349     // TODO: remove this by adding an event-mark free flavor of this function
350     if (GMX_GPU == GMX_GPU_CUDA)
351     {
352         xReadyOnDevice_[atomLocality].markEvent(xCopyStreams_[atomLocality]);
353     }
354
355     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
356     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
357 }
358
359 GpuEventSynchronizer*
360 StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
361                                                                const SimulationWorkload& simulationWork,
362                                                                const StepWorkload&       stepWork)
363 {
364     // The provider of the coordinates may be different for local atoms. If the update is offloaded
365     // and this is not a neighbor search step, then the consumer needs to wait for the update
366     // to complete. Otherwise, the coordinates are copied from the host and we need to wait for
367     // the copy event. Non-local coordinates are always provided by the H2D copy.
368     //
369     // TODO: This should be reconsidered to support the halo exchange.
370     //
371     // In OpenCL no events are used as coordinate sync is not necessary
372     if (GMX_GPU == GMX_GPU_OPENCL)
373     {
374         return nullptr;
375     }
376     if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
377     {
378         return &xUpdatedOnDevice_;
379     }
380     else
381     {
382         return &xReadyOnDevice_[atomLocality];
383     }
384 }
385
386 void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
387 {
388     wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
389     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
390     xReadyOnDevice_[atomLocality].waitForEvent();
391     wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
392 }
393
394 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
395 {
396     return &xUpdatedOnDevice_;
397 }
398
399 void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
400 {
401     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
402     CommandStream commandStream = xCopyStreams_[atomLocality];
403     GMX_ASSERT(commandStream != nullptr,
404                "No stream is valid for copying positions with given atom locality.");
405
406     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
407     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
408
409     copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream);
410     // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
411     xReadyOnHost_[atomLocality].markEvent(commandStream);
412
413     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
414     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
415 }
416
417 void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
418 {
419     wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
420     xReadyOnHost_[atomLocality].waitForEvent();
421     wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
422 }
423
424
425 DeviceBuffer<float> StatePropagatorDataGpu::Impl::getVelocities()
426 {
427     return d_v_;
428 }
429
430 void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
431                                                        AtomLocality atomLocality)
432 {
433     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
434     CommandStream commandStream = vCopyStreams_[atomLocality];
435     GMX_ASSERT(commandStream != nullptr,
436                "No stream is valid for copying velocities with given atom locality.");
437
438     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
439     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
440
441     copyToDevice(d_v_, h_v, d_vSize_, atomLocality, commandStream);
442     vReadyOnDevice_[atomLocality].markEvent(commandStream);
443
444     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
445     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
446 }
447
448 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
449 {
450     return &vReadyOnDevice_[atomLocality];
451 }
452
453
454 void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v, AtomLocality atomLocality)
455 {
456     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
457     CommandStream commandStream = vCopyStreams_[atomLocality];
458     GMX_ASSERT(commandStream != nullptr,
459                "No stream is valid for copying velocities with given atom locality.");
460
461     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
462     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
463
464     copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream);
465     vReadyOnHost_[atomLocality].markEvent(commandStream);
466
467     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
468     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
469 }
470
471 void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
472 {
473     wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
474     vReadyOnHost_[atomLocality].waitForEvent();
475     wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
476 }
477
478
479 DeviceBuffer<float> StatePropagatorDataGpu::Impl::getForces()
480 {
481     return d_f_;
482 }
483
484 void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
485                                                    AtomLocality atomLocality)
486 {
487     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
488     CommandStream commandStream = fCopyStreams_[atomLocality];
489     GMX_ASSERT(commandStream != nullptr,
490                "No stream is valid for copying forces with given atom locality.");
491
492     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
493     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
494
495     copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
496     fReadyOnDevice_[atomLocality].markEvent(commandStream);
497
498     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
499     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
500 }
501
502 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
503                                                                                 bool useGpuFBufferOps)
504 {
505     if ((atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal) && useGpuFBufferOps)
506     {
507         return &fReducedOnDevice_;
508     }
509     else
510     {
511         return &fReadyOnDevice_[atomLocality];
512     }
513 }
514
515 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice()
516 {
517     return &fReducedOnDevice_;
518 }
519
520 void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
521 {
522     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
523     CommandStream commandStream = fCopyStreams_[atomLocality];
524     GMX_ASSERT(commandStream != nullptr,
525                "No stream is valid for copying forces with given atom locality.");
526
527     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
528     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
529
530     copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
531     fReadyOnHost_[atomLocality].markEvent(commandStream);
532
533     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
534     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
535 }
536
537 void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)
538 {
539     wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
540     fReadyOnHost_[atomLocality].waitForEvent();
541     wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
542 }
543
544 void* StatePropagatorDataGpu::Impl::getUpdateStream()
545 {
546     return &updateStream_;
547 }
548
549 int StatePropagatorDataGpu::Impl::numAtomsLocal()
550 {
551     return numAtomsLocal_;
552 }
553
554 int StatePropagatorDataGpu::Impl::numAtomsAll()
555 {
556     return numAtomsAll_;
557 }
558
559
560 StatePropagatorDataGpu::StatePropagatorDataGpu(const void*        pmeStream,
561                                                const void*        localStream,
562                                                const void*        nonLocalStream,
563                                                const void*        deviceContext,
564                                                GpuApiCallBehavior transferKind,
565                                                int                paddingSize,
566                                                gmx_wallcycle*     wcycle) :
567     impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle))
568 {
569 }
570
571 StatePropagatorDataGpu::StatePropagatorDataGpu(const void*        pmeStream,
572                                                const void*        deviceContext,
573                                                GpuApiCallBehavior transferKind,
574                                                int                paddingSize,
575                                                gmx_wallcycle*     wcycle) :
576     impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle))
577 {
578 }
579
580 StatePropagatorDataGpu::StatePropagatorDataGpu(StatePropagatorDataGpu&& /* other */) noexcept = default;
581
582 StatePropagatorDataGpu& StatePropagatorDataGpu::operator=(StatePropagatorDataGpu&& /* other */) noexcept = default;
583
584 StatePropagatorDataGpu::~StatePropagatorDataGpu() = default;
585
586
587 void StatePropagatorDataGpu::reinit(int numAtomsLocal, int numAtomsAll)
588 {
589     return impl_->reinit(numAtomsLocal, numAtomsAll);
590 }
591
592 std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
593 {
594     return impl_->getAtomRangesFromAtomLocality(atomLocality);
595 }
596
597
598 DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
599 {
600     return impl_->getCoordinates();
601 }
602
603 void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
604                                                   AtomLocality                         atomLocality)
605 {
606     return impl_->copyCoordinatesToGpu(h_x, atomLocality);
607 }
608
609 GpuEventSynchronizer*
610 StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
611                                                          const SimulationWorkload& simulationWork,
612                                                          const StepWorkload&       stepWork)
613 {
614     return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
615 }
616
617 void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
618 {
619     return impl_->waitCoordinatesCopiedToDevice(atomLocality);
620 }
621
622 GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
623 {
624     return impl_->xUpdatedOnDevice();
625 }
626
627 void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec> h_x, AtomLocality atomLocality)
628 {
629     return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
630 }
631
632 void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
633 {
634     return impl_->waitCoordinatesReadyOnHost(atomLocality);
635 }
636
637
638 DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
639 {
640     return impl_->getVelocities();
641 }
642
643 void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
644                                                  AtomLocality                         atomLocality)
645 {
646     return impl_->copyVelocitiesToGpu(h_v, atomLocality);
647 }
648
649 GpuEventSynchronizer* StatePropagatorDataGpu::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
650 {
651     return impl_->getVelocitiesReadyOnDeviceEvent(atomLocality);
652 }
653
654 void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<RVec> h_v, AtomLocality atomLocality)
655 {
656     return impl_->copyVelocitiesFromGpu(h_v, atomLocality);
657 }
658
659 void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
660 {
661     return impl_->waitVelocitiesReadyOnHost(atomLocality);
662 }
663
664
665 DeviceBuffer<float> StatePropagatorDataGpu::getForces()
666 {
667     return impl_->getForces();
668 }
669
670 void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality)
671 {
672     return impl_->copyForcesToGpu(h_f, atomLocality);
673 }
674
675 GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
676                                                                           bool useGpuFBufferOps)
677 {
678     return impl_->getForcesReadyOnDeviceEvent(atomLocality, useGpuFBufferOps);
679 }
680
681 GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
682 {
683     return impl_->fReducedOnDevice();
684 }
685
686 void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec> h_f, AtomLocality atomLocality)
687 {
688     return impl_->copyForcesFromGpu(h_f, atomLocality);
689 }
690
691 void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality atomLocality)
692 {
693     return impl_->waitForcesReadyOnHost(atomLocality);
694 }
695
696
697 void* StatePropagatorDataGpu::getUpdateStream()
698 {
699     return impl_->getUpdateStream();
700 }
701
702 int StatePropagatorDataGpu::numAtomsLocal()
703 {
704     return impl_->numAtomsLocal();
705 }
706
707 int StatePropagatorDataGpu::numAtomsAll()
708 {
709     return impl_->numAtomsAll();
710 }
711
712 } // namespace gmx
713
714 #endif // GMX_GPU == GMX_GPU_NONE