Merge branch release-2020 into master
[alexxy/gromacs.git] / src / gromacs / mdtypes / state_propagator_data_gpu_impl_gpu.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2019,2020, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*! \internal \file
36  *
37  * \brief Definitions of interfaces for GPU state data propagator object.
38  *
39  * \author Artem Zhmurov <zhmurov@gmail.com>
40  *
41  * \ingroup module_mdtypes
42  */
43 #include "gmxpre.h"
44
45 #include "config.h"
46
47 #if GMX_GPU != GMX_GPU_NONE
48
49 #    if GMX_GPU == GMX_GPU_CUDA
50 #        include "gromacs/gpu_utils/cudautils.cuh"
51 #    endif
52 #    include "gromacs/gpu_utils/devicebuffer.h"
53 #    if GMX_GPU == GMX_GPU_OPENCL
54 #        include "gromacs/gpu_utils/oclutils.h"
55 #    endif
56 #    include "gromacs/math/vectypes.h"
57 #    include "gromacs/mdtypes/state_propagator_data_gpu.h"
58 #    include "gromacs/timing/wallcycle.h"
59 #    include "gromacs/utility/classhelpers.h"
60
61 #    include "state_propagator_data_gpu_impl.h"
62
63
64 namespace gmx
65 {
66
/*! \brief Constructor for mdrun ranks: adopts the provided GPU streams and context.
 *
 * In OpenCL builds the PME stream doubles as the update stream and the local/non-local
 * streams are unused; in CUDA builds each provided stream is adopted when non-null and
 * a dedicated update stream is created here.
 */
StatePropagatorDataGpu::Impl::Impl(const void*        pmeStream,
                                   const void*        localStream,
                                   const void*        nonLocalStream,
                                   const void*        deviceContext,
                                   GpuApiCallBehavior transferKind,
                                   int                paddingSize,
                                   gmx_wallcycle*     wcycle) :
    transferKind_(transferKind),
    paddingSize_(paddingSize),
    wcycle_(wcycle)
{
    static_assert(GMX_GPU != GMX_GPU_NONE,
                  "This object should only be constructed on the GPU code-paths.");

    // TODO: Refactor when the StreamManager is introduced.
    if (GMX_GPU == GMX_GPU_OPENCL)
    {
        GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds.");
        GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");

        // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context.
        pmeStream_     = *static_cast<const CommandStream*>(pmeStream);
        updateStream_  = *static_cast<const CommandStream*>(pmeStream);
        deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
        GMX_UNUSED_VALUE(localStream);
        GMX_UNUSED_VALUE(nonLocalStream);
    }

    if (GMX_GPU == GMX_GPU_CUDA)
    {
        // Streams may legitimately be null (e.g. no non-local work without domain decomposition).
        if (pmeStream != nullptr)
        {
            pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
        }
        if (localStream != nullptr)
        {
            localStream_ = *static_cast<const CommandStream*>(localStream);
        }
        if (nonLocalStream != nullptr)
        {
            nonLocalStream_ = *static_cast<const CommandStream*>(nonLocalStream);
        }

        // TODO: The update stream should be created only when it is needed.
        // The preprocessor guard (in addition to the runtime check above) is required because
        // cudaStreamCreate/CU_RET_ERR only compile in CUDA builds.
#    if (GMX_GPU == GMX_GPU_CUDA)
        cudaError_t stat;
        stat = cudaStreamCreate(&updateStream_);
        CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
#    endif
        GMX_UNUSED_VALUE(deviceContext);
    }

    // Map the atom locality to the stream that will be used for coordinates,
    // velocities and forces transfers. Same streams are used for H2D and D2H copies.
    // Note, that nullptr stream is used here to indicate that the copy is not supported.
    xCopyStreams_[AtomLocality::Local]    = updateStream_;
    xCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
    xCopyStreams_[AtomLocality::All]      = updateStream_;

    vCopyStreams_[AtomLocality::Local]    = updateStream_;
    vCopyStreams_[AtomLocality::NonLocal] = nullptr;
    vCopyStreams_[AtomLocality::All]      = updateStream_;

    fCopyStreams_[AtomLocality::Local]    = localStream_;
    fCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
    fCopyStreams_[AtomLocality::All]      = updateStream_;
}
134
/*! \brief Constructor for PME-only ranks and PME tests: only the PME stream is available,
 * so only local/all coordinate copies are supported; all other copy streams are null.
 */
StatePropagatorDataGpu::Impl::Impl(const void*        pmeStream,
                                   const void*        deviceContext,
                                   GpuApiCallBehavior transferKind,
                                   int                paddingSize,
                                   gmx_wallcycle*     wcycle) :
    transferKind_(transferKind),
    paddingSize_(paddingSize),
    wcycle_(wcycle)
{
    static_assert(GMX_GPU != GMX_GPU_NONE,
                  "This object should only be constructed on the GPU code-paths.");

    if (GMX_GPU == GMX_GPU_OPENCL)
    {
        GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds.");
        deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
    }

    GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set.");
    pmeStream_ = *static_cast<const CommandStream*>(pmeStream);

    localStream_    = nullptr;
    nonLocalStream_ = nullptr;
    updateStream_   = nullptr;


    // Only local/all coordinates are allowed to be copied in PME-only rank/ PME tests.
    // This is a temporary measure to make it safe to use this class in those cases.
    xCopyStreams_[AtomLocality::Local]    = pmeStream_;
    xCopyStreams_[AtomLocality::NonLocal] = nullptr;
    xCopyStreams_[AtomLocality::All]      = pmeStream_;

    vCopyStreams_[AtomLocality::Local]    = nullptr;
    vCopyStreams_[AtomLocality::NonLocal] = nullptr;
    vCopyStreams_[AtomLocality::All]      = nullptr;

    fCopyStreams_[AtomLocality::Local]    = nullptr;
    fCopyStreams_[AtomLocality::NonLocal] = nullptr;
    fCopyStreams_[AtomLocality::All]      = nullptr;
}
175
176 StatePropagatorDataGpu::Impl::~Impl() {}
177
/*! \brief (Re-)allocates the device buffers for coordinates, velocities and forces
 * to fit the new atom counts; pads the coordinate buffer, clears the padding region,
 * and (CUDA only) clears newly grown force capacity.
 */
void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
{
    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    numAtomsLocal_ = numAtomsLocal;
    numAtomsAll_   = numAtomsAll;

    // Round the total atom count up to a multiple of paddingSize_, if padding is requested.
    int numAtomsPadded;
    if (paddingSize_ > 0)
    {
        numAtomsPadded = ((numAtomsAll_ + paddingSize_ - 1) / paddingSize_) * paddingSize_;
    }
    else
    {
        numAtomsPadded = numAtomsAll_;
    }

    reallocateDeviceBuffer(&d_x_, numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);

    const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
    if (paddingAllocationSize > 0)
    {
        // The PME stream is used here because the padding region of d_x_ is only in the PME task.
        clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, pmeStream_);
    }

    reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
    const int d_fOldCapacity = d_fCapacity_;
    reallocateDeviceBuffer(&d_f_, numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
    // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
    // the force accumulation stage before syncing with the local stream. Only done in CUDA,
    // since the force buffer ops are not implemented in OpenCL.
    if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
    {
        clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
219
220 std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
221 {
222     int atomsStartAt   = 0;
223     int numAtomsToCopy = 0;
224     switch (atomLocality)
225     {
226         case AtomLocality::All:
227             atomsStartAt   = 0;
228             numAtomsToCopy = numAtomsAll_;
229             break;
230         case AtomLocality::Local:
231             atomsStartAt   = 0;
232             numAtomsToCopy = numAtomsLocal_;
233             break;
234         case AtomLocality::NonLocal:
235             atomsStartAt   = numAtomsLocal_;
236             numAtomsToCopy = numAtomsAll_ - numAtomsLocal_;
237             break;
238         default:
239             GMX_RELEASE_ASSERT(false,
240                                "Wrong range of atoms requested in GPU state data manager. Should "
241                                "be All, Local or NonLocal.");
242     }
243     GMX_ASSERT(atomsStartAt >= 0,
244                "The first elemtnt to copy has negative index. Probably, the GPU propagator state "
245                "was not initialized.");
246     GMX_ASSERT(numAtomsToCopy >= 0,
247                "Number of atoms to copy is negative. Probably, the GPU propagator state was not "
248                "initialized.");
249     return std::make_tuple(atomsStartAt, numAtomsToCopy);
250 }
251
/*! \brief Generic H2D copy helper: copies the atom range selected by \p atomLocality
 * from host buffer \p h_data into device buffer \p d_data on \p commandStream.
 *
 * \param[out] d_data        Destination device buffer.
 * \param[in]  h_data        Source host-side data.
 * \param[in]  dataSize      Allocated size of the device buffer (asserted only).
 * \param[in]  atomLocality  Which atom range to copy.
 * \param[in]  commandStream Stream to launch the copy in; must be non-null.
 */
void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>                   d_data,
                                                const gmx::ArrayRef<const gmx::RVec> h_data,
                                                int                                  dataSize,
                                                AtomLocality                         atomLocality,
                                                CommandStream                        commandStream)
{
    // dataSize is only read inside asserts; silence unused-variable warnings in release builds.
    GMX_UNUSED_VALUE(dataSize);

    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");

    GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");

    GMX_ASSERT(commandStream != nullptr,
               "No stream is valid for copying with given atom locality.");
    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    int atomsStartAt, numAtomsToCopy;
    std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);

    if (numAtomsToCopy != 0)
    {
        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
                   "The device allocation is smaller than requested copy range.");
        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
                   "The host buffer is smaller than the requested copy range.");

        copyToDeviceBuffer(&d_data, reinterpret_cast<const RVec*>(&h_data.data()[atomsStartAt]),
                           atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
286
/*! \brief Generic D2H copy helper: copies the atom range selected by \p atomLocality
 * from device buffer \p d_data into host buffer \p h_data on \p commandStream.
 *
 * \param[out] h_data        Destination host-side buffer.
 * \param[in]  d_data        Source device buffer.
 * \param[in]  dataSize      Allocated size of the device buffer (asserted only).
 * \param[in]  atomLocality  Which atom range to copy.
 * \param[in]  commandStream Stream to launch the copy in; must be non-null.
 */
void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
                                                  DeviceBuffer<RVec>       d_data,
                                                  int                      dataSize,
                                                  AtomLocality             atomLocality,
                                                  CommandStream            commandStream)
{
    // dataSize is only read inside asserts; silence unused-variable warnings in release builds.
    GMX_UNUSED_VALUE(dataSize);

    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");

    GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");

    GMX_ASSERT(commandStream != nullptr,
               "No stream is valid for copying with given atom locality.");
    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    int atomsStartAt, numAtomsToCopy;
    std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);

    if (numAtomsToCopy != 0)
    {
        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
                   "The device allocation is smaller than requested copy range.");
        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
                   "The host buffer is smaller than the requested copy range.");

        copyFromDeviceBuffer(reinterpret_cast<RVec*>(&h_data.data()[atomsStartAt]), &d_data,
                             atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
321
//! Returns the device buffer holding the coordinates.
DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getCoordinates()
{
    return d_x_;
}
326
/*! \brief Copies host coordinates for the given locality to the device and
 * (in CUDA) marks the corresponding ready-on-device event.
 */
void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
                                                        AtomLocality atomLocality)
{
    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
    CommandStream commandStream = xCopyStreams_[atomLocality];
    GMX_ASSERT(commandStream != nullptr,
               "No stream is valid for copying positions with given atom locality.");

    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream);

    // markEvent is skipped in OpenCL as:
    //   - it's not needed, copy is done in the same stream as the only consumer task (PME)
    //   - we don't consume the events in OpenCL which is not allowed by GpuEventSynchronizer (would leak memory).
    // TODO: remove this by adding an event-mark free flavor of this function
    if (GMX_GPU == GMX_GPU_CUDA)
    {
        xReadyOnDevice_[atomLocality].markEvent(xCopyStreams_[atomLocality]);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
352
353 GpuEventSynchronizer*
354 StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
355                                                                const SimulationWorkload& simulationWork,
356                                                                const StepWorkload&       stepWork)
357 {
358     // The provider of the coordinates may be different for local atoms. If the update is offloaded
359     // and this is not a neighbor search step, then the consumer needs to wait for the update
360     // to complete. Otherwise, the coordinates are copied from the host and we need to wait for
361     // the copy event. Non-local coordinates are always provided by the H2D copy.
362     //
363     // TODO: This should be reconsidered to support the halo exchange.
364     //
365     // In OpenCL no events are used as coordinate sync is not necessary
366     if (GMX_GPU == GMX_GPU_OPENCL)
367     {
368         return nullptr;
369     }
370     if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
371     {
372         return &xUpdatedOnDevice_;
373     }
374     else
375     {
376         return &xReadyOnDevice_[atomLocality];
377     }
378 }
379
//! Blocks the host until the H2D coordinate copy for \p atomLocality has completed.
void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
{
    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
    xReadyOnDevice_[atomLocality].waitForEvent();
    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
}
387
//! Returns the event marked when the GPU update has produced new coordinates.
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
{
    return &xUpdatedOnDevice_;
}
392
393 void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
394 {
395     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
396     CommandStream commandStream = xCopyStreams_[atomLocality];
397     GMX_ASSERT(commandStream != nullptr,
398                "No stream is valid for copying positions with given atom locality.");
399
400     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
401     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
402
403     copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream);
404     // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
405     xReadyOnHost_[atomLocality].markEvent(commandStream);
406
407     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
408     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
409 }
410
//! Blocks the host until the D2H coordinate copy for \p atomLocality has completed.
void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
{
    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
    xReadyOnHost_[atomLocality].waitForEvent();
    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
}
417
418
//! Returns the device buffer holding the velocities.
DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getVelocities()
{
    return d_v_;
}
423
424 void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
425                                                        AtomLocality atomLocality)
426 {
427     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
428     CommandStream commandStream = vCopyStreams_[atomLocality];
429     GMX_ASSERT(commandStream != nullptr,
430                "No stream is valid for copying velocities with given atom locality.");
431
432     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
433     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
434
435     copyToDevice(d_v_, h_v, d_vSize_, atomLocality, commandStream);
436     vReadyOnDevice_[atomLocality].markEvent(commandStream);
437
438     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
439     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
440 }
441
//! Returns the event marked when the H2D velocity copy for \p atomLocality was enqueued.
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
{
    return &vReadyOnDevice_[atomLocality];
}
446
447
448 void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v, AtomLocality atomLocality)
449 {
450     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
451     CommandStream commandStream = vCopyStreams_[atomLocality];
452     GMX_ASSERT(commandStream != nullptr,
453                "No stream is valid for copying velocities with given atom locality.");
454
455     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
456     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
457
458     copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream);
459     vReadyOnHost_[atomLocality].markEvent(commandStream);
460
461     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
462     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
463 }
464
//! Blocks the host until the D2H velocity copy for \p atomLocality has completed.
void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
{
    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
    vReadyOnHost_[atomLocality].waitForEvent();
    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
}
471
472
//! Returns the device buffer holding the forces.
DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getForces()
{
    return d_f_;
}
477
478 void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
479                                                    AtomLocality atomLocality)
480 {
481     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
482     CommandStream commandStream = fCopyStreams_[atomLocality];
483     GMX_ASSERT(commandStream != nullptr,
484                "No stream is valid for copying forces with given atom locality.");
485
486     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
487     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
488
489     copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
490     fReadyOnDevice_[atomLocality].markEvent(commandStream);
491
492     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
493     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
494 }
495
496 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
497                                                                                 bool useGpuFBufferOps)
498 {
499     if ((atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal) && useGpuFBufferOps)
500     {
501         return &fReducedOnDevice_;
502     }
503     else
504     {
505         return &fReadyOnDevice_[atomLocality];
506     }
507 }
508
//! Returns the event marked when the GPU force reduction has completed.
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice()
{
    return &fReducedOnDevice_;
}
513
514 void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
515 {
516     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
517     CommandStream commandStream = fCopyStreams_[atomLocality];
518     GMX_ASSERT(commandStream != nullptr,
519                "No stream is valid for copying forces with given atom locality.");
520
521     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
522     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
523
524     copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
525     fReadyOnHost_[atomLocality].markEvent(commandStream);
526
527     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
528     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
529 }
530
//! Blocks the host until the D2H force copy for \p atomLocality has completed.
void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)
{
    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
    fReadyOnHost_[atomLocality].waitForEvent();
    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
}
537
//! Returns an opaque pointer to the update stream (to be cast back to CommandStream by the caller).
void* StatePropagatorDataGpu::Impl::getUpdateStream()
{
    return &updateStream_;
}
542
//! Returns the number of local atoms set by the last reinit().
int StatePropagatorDataGpu::Impl::numAtomsLocal()
{
    return numAtomsLocal_;
}
547
//! Returns the total number of atoms set by the last reinit().
int StatePropagatorDataGpu::Impl::numAtomsAll()
{
    return numAtomsAll_;
}
552
553
//! Public constructor for mdrun ranks; forwards all arguments to the implementation.
StatePropagatorDataGpu::StatePropagatorDataGpu(const void*        pmeStream,
                                               const void*        localStream,
                                               const void*        nonLocalStream,
                                               const void*        deviceContext,
                                               GpuApiCallBehavior transferKind,
                                               int                paddingSize,
                                               gmx_wallcycle*     wcycle) :
    impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle))
{
}
564
//! Public constructor for PME-only ranks and PME tests; forwards to the implementation.
StatePropagatorDataGpu::StatePropagatorDataGpu(const void*        pmeStream,
                                               const void*        deviceContext,
                                               GpuApiCallBehavior transferKind,
                                               int                paddingSize,
                                               gmx_wallcycle*     wcycle) :
    impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle))
{
}
573
574 StatePropagatorDataGpu::StatePropagatorDataGpu(StatePropagatorDataGpu&& /* other */) noexcept = default;
575
576 StatePropagatorDataGpu& StatePropagatorDataGpu::operator=(StatePropagatorDataGpu&& /* other */) noexcept = default;
577
578 StatePropagatorDataGpu::~StatePropagatorDataGpu() = default;
579
580
581 void StatePropagatorDataGpu::reinit(int numAtomsLocal, int numAtomsAll)
582 {
583     return impl_->reinit(numAtomsLocal, numAtomsAll);
584 }
585
//! PImpl forwarder; see Impl::getAtomRangesFromAtomLocality.
std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
{
    return impl_->getAtomRangesFromAtomLocality(atomLocality);
}
590
591
//! PImpl forwarder; see Impl::getCoordinates.
DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
{
    return impl_->getCoordinates();
}
596
597 void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
598                                                   AtomLocality                         atomLocality)
599 {
600     return impl_->copyCoordinatesToGpu(h_x, atomLocality);
601 }
602
//! PImpl forwarder; see Impl::getCoordinatesReadyOnDeviceEvent.
GpuEventSynchronizer*
StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
                                                         const SimulationWorkload& simulationWork,
                                                         const StepWorkload&       stepWork)
{
    return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
}
610
611 void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
612 {
613     return impl_->waitCoordinatesCopiedToDevice(atomLocality);
614 }
615
//! PImpl forwarder; see Impl::xUpdatedOnDevice.
GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
{
    return impl_->xUpdatedOnDevice();
}
620
621 void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec> h_x, AtomLocality atomLocality)
622 {
623     return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
624 }
625
626 void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
627 {
628     return impl_->waitCoordinatesReadyOnHost(atomLocality);
629 }
630
631
//! PImpl forwarder; see Impl::getVelocities.
DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
{
    return impl_->getVelocities();
}
636
637 void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
638                                                  AtomLocality                         atomLocality)
639 {
640     return impl_->copyVelocitiesToGpu(h_v, atomLocality);
641 }
642
//! PImpl forwarder; see Impl::getVelocitiesReadyOnDeviceEvent.
GpuEventSynchronizer* StatePropagatorDataGpu::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
{
    return impl_->getVelocitiesReadyOnDeviceEvent(atomLocality);
}
647
648 void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<RVec> h_v, AtomLocality atomLocality)
649 {
650     return impl_->copyVelocitiesFromGpu(h_v, atomLocality);
651 }
652
653 void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
654 {
655     return impl_->waitVelocitiesReadyOnHost(atomLocality);
656 }
657
658
//! PImpl forwarder; see Impl::getForces.
DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
{
    return impl_->getForces();
}
663
664 void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality)
665 {
666     return impl_->copyForcesToGpu(h_f, atomLocality);
667 }
668
//! PImpl forwarder; see Impl::getForcesReadyOnDeviceEvent.
GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
                                                                          bool useGpuFBufferOps)
{
    return impl_->getForcesReadyOnDeviceEvent(atomLocality, useGpuFBufferOps);
}
674
//! PImpl forwarder; see Impl::fReducedOnDevice.
GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
{
    return impl_->fReducedOnDevice();
}
679
680 void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec> h_f, AtomLocality atomLocality)
681 {
682     return impl_->copyForcesFromGpu(h_f, atomLocality);
683 }
684
685 void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality atomLocality)
686 {
687     return impl_->waitForcesReadyOnHost(atomLocality);
688 }
689
690
//! PImpl forwarder; see Impl::getUpdateStream.
void* StatePropagatorDataGpu::getUpdateStream()
{
    return impl_->getUpdateStream();
}
695
//! PImpl forwarder; see Impl::numAtomsLocal.
int StatePropagatorDataGpu::numAtomsLocal()
{
    return impl_->numAtomsLocal();
}
700
//! PImpl forwarder; see Impl::numAtomsAll.
int StatePropagatorDataGpu::numAtomsAll()
{
    return impl_->numAtomsAll();
}
705
706 } // namespace gmx
707
708 #endif // GMX_GPU == GMX_GPU_NONE