/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief Implements GPU halo exchange using CUDA.
 *
 * \author Alan Gray <alang@nvidia.com>
 *
 * \ingroup module_domdec
 */
#include "gmxpre.h"

#include "gpuhaloexchange_impl.cuh"

#include "config.h"

#include <assert.h>
#include <stdio.h>

#include <utility>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/typecasts.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/math/vectypes.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/gmxmpi.h"

#include "domdec_internal.h"

namespace gmx
{

//! Number of CUDA threads in a block
// TODO Optimize this through experimentation
constexpr static int c_threadsPerBlock = 256;

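/*! \brief Pack the non-local coordinate data buffer on the GPU using a
 * pre-populated "map" containing index information.
 *
 * \tparam     usePBC          whether to apply \p coordinateShift to the packed values
 * \param[out] dataPacked      packed array of coordinate values to be transferred
 * \param[in]  data            full array of coordinate values
 * \param[in]  map             array of indices mapping each packed element to its
 *                             position in the full array
 * \param[in]  mapSize         number of elements in map array
 * \param[in]  coordinateShift periodic coordinate shift applied when \p usePBC is true
 */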
template<bool usePBC>
__global__ void packSendBufKernel(float3* __restrict__ dataPacked,
                                  const float3* __restrict__ data,
                                  const int* __restrict__ map,
                                  const int    mapSize,
                                  const float3 coordinateShift)
{
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadIndex < mapSize)
    {
        // Look up the map entry only for in-range threads, to avoid an out-of-bounds read
        float3*       gm_dataDest = &dataPacked[threadIndex];
        const float3* gm_dataSrc  = &data[map[threadIndex]];

        if (usePBC)
        {
            *gm_dataDest = *gm_dataSrc + coordinateShift;
        }
        else
        {
            *gm_dataDest = *gm_dataSrc;
        }
    }
}

/*! \brief Unpack the non-local force data buffer on the GPU using a pre-populated
 * "map" containing index information.
 *
 * \tparam     accumulate  whether to accumulate into, rather than overwrite, the output forces
 * \param[out] data        full array of force values
 * \param[in]  dataPacked  packed array of force values to be transferred
 * \param[in]  map         array of indices mapping each packed element to its
 *                         position in the full array
 * \param[in]  mapSize     number of elements in map array
 */
template<bool accumulate>
__global__ void unpackRecvBufKernel(float3* __restrict__ data,
                                    const float3* __restrict__ dataPacked,
                                    const int* __restrict__ map,
                                    const int mapSize)
{
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadIndex < mapSize)
    {
        // Look up the map entry only for in-range threads, to avoid an out-of-bounds read
        const float3* gm_dataSrc  = &dataPacked[threadIndex];
        float3*       gm_dataDest = &data[map[threadIndex]];

        if (accumulate)
        {
            *gm_dataDest += *gm_dataSrc;
        }
        else
        {
            *gm_dataDest = *gm_dataSrc;
        }
    }
}

void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_forcesBuffer)
{
    wallcycle_start(wcycle_, ewcDOMDEC);
    wallcycle_sub_start(wcycle_, ewcsDD_GPU);

    d_x_ = d_coordinatesBuffer;
    d_f_ = d_forcesBuffer;

    const gmx_domdec_comm_t&     comm = *dd_->comm;
    const gmx_domdec_comm_dim_t& cd   = comm.cd[dimIndex_];
    const gmx_domdec_ind_t&      ind  = cd.ind[pulse_];

    numHomeAtoms_ = comm.atomRanges.numHomeAtoms(); // offset for data received by this rank

    // Determine receive offset for the dimension index and pulse of this halo exchange object
    int numZoneTemp   = 1;
    int numZone       = 0;
    int numAtomsTotal = numHomeAtoms_;
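    // Walk over all pulses of every dimension up to and including this object's
    // dimension and pulse, so that atomOffset_ ends up at the index where the
    // atoms received by this halo exchange begin; the number of communication
    // zones doubles with each additional decomposition dimension.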
    for (int i = 0; i <= dimIndex_; i++)
    {
        int pulseMax = (i == dimIndex_) ? pulse_ : (comm.cd[i].numPulses() - 1);
        for (int p = 0; p <= pulseMax; p++)
        {
            atomOffset_                     = numAtomsTotal;
            const gmx_domdec_ind_t& indTemp = comm.cd[i].ind[p];
            numAtomsTotal += indTemp.nrecv[numZoneTemp + 1];
        }
        numZone = numZoneTemp;
        numZoneTemp += numZoneTemp;
    }

    int newSize = ind.nsend[numZone + 1];

    GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange");

    // reallocates only if needed
    h_indexMap_.resize(newSize);
    // reallocate on device only if needed
    if (newSize > maxPackedBufferSize_)
    {
        reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, deviceContext_);
        reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, deviceContext_);
        reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, deviceContext_);
        maxPackedBufferSize_ = newSize;
    }

    xSendSize_ = newSize;
#if GMX_MPI
    MPI_Sendrecv(&xSendSize_,
                 sizeof(int),
                 MPI_BYTE,
                 sendRankX_,
                 0,
                 &xRecvSize_,
                 sizeof(int),
                 MPI_BYTE,
                 recvRankX_,
                 0,
                 mpi_comm_mysim_,
                 MPI_STATUS_IGNORE);
#endif
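    // The force halo retraces the coordinate halo in the opposite direction, so
    // the send and receive sizes swap roles.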
    fSendSize_ = xRecvSize_;
    fRecvSize_ = xSendSize_;

    if (newSize > 0)
    {
        GMX_ASSERT(ind.index.size() == h_indexMap_.size(),
                   "Size mismatch between domain decomposition communication index array and GPU "
                   "halo exchange index mapping array");
        std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());

        copyToDeviceBuffer(
                &d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, GpuApiCallBehavior::Async, nullptr);
    }

#if GMX_MPI
    // Exchange of remote addresses between neighboring ranks is needed only with CUDA-direct transfers,
    // as cudaMemcpy needs both the source and destination pointers. MPI calls such as MPI_Send do not
    // need the receive address; that is handled by the matching MPI_Recv on the neighboring rank.
    if (GMX_THREAD_MPI)
    {
        // This rank will push data to its neighbor, so needs to know the
        // remote receive address, and similarly sends its own receive
        // address to the other neighbor. We can do this here in the reinit
        // function since the pointers will not change until the next NS step.

        // Coordinates buffer:
        float3* recvPtr = &d_x_[atomOffset_];
        MPI_Sendrecv(&recvPtr,
                     sizeof(void*),
                     MPI_BYTE,
                     recvRankX_,
                     0,
                     &remoteXPtr_,
                     sizeof(void*),
                     MPI_BYTE,
                     sendRankX_,
                     0,
                     mpi_comm_mysim_,
                     MPI_STATUS_IGNORE);

        // Force buffer:
        recvPtr = d_recvBuf_;
        MPI_Sendrecv(&recvPtr,
                     sizeof(void*),
                     MPI_BYTE,
                     recvRankF_,
                     0,
                     &remoteFPtr_,
                     sizeof(void*),
                     MPI_BYTE,
                     sendRankF_,
                     0,
                     mpi_comm_mysim_,
                     MPI_STATUS_IGNORE);
    }
#endif

    wallcycle_sub_stop(wcycle_, ewcsDD_GPU);
    wallcycle_stop(wcycle_, ewcDOMDEC);
}

void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
    GMX_ASSERT(coordinatesReadyOnDeviceEvent != nullptr,
               "Coordinate halo exchange requires a valid coordinate-ready event");

    // Wait for the event from the receiving task signalling that its remote coordinates are ready,
    // and enqueue that event to the stream used for the subsequent data push. This avoids a race
    // condition with the remote data being written in the previous timestep.
    // Similarly, send this rank's event to the task that will push data to it.
    GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
    MPI_Sendrecv(&coordinatesReadyOnDeviceEvent,
                 sizeof(GpuEventSynchronizer*),
                 MPI_BYTE,
                 recvRankX_,
                 0,
                 &remoteCoordinatesReadyOnDeviceEvent,
                 sizeof(GpuEventSynchronizer*),
                 MPI_BYTE,
                 sendRankX_,
                 0,
                 mpi_comm_mysim_,
                 MPI_STATUS_IGNORE);
    remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
}

void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box,
                                                       GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
    wallcycle_start(wcycle_, ewcLAUNCH_GPU);
    if (pulse_ == 0)
    {
        // ensure stream waits until coordinate data is available on device
        coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
    }

    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEX);

    // launch kernel to pack send buffer
    KernelLaunchConfig config;
    config.blockSize[0]     = c_threadsPerBlock;
    config.blockSize[1]     = 1;
    config.blockSize[2]     = 1;
    config.gridSize[0]      = (xSendSize_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
    config.gridSize[1]      = 1;
    config.gridSize[2]      = 1;
    config.sharedMemorySize = 0;
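    // The grid is sized with one thread per element of the packed buffer,
    // rounded up to a whole number of blocks.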

    const float3* sendBuf  = d_sendBuf_;
    const float3* d_x      = d_x_;
    const int*    indexMap = d_indexMap_;
    const int     size     = xSendSize_;
    // The coordinateShift changes between steps when we have
    // performed a DD partition, or have updated the box e.g. when
    // performing pressure coupling. So, for simplicity, the box
    // is used every step to pass the shift vector as an argument of
    // the packing kernel.
    const int    boxDimensionIndex = dd_->dim[dimIndex_];
    const float3 coordinateShift{ box[boxDimensionIndex][XX],
                                  box[boxDimensionIndex][YY],
                                  box[boxDimensionIndex][ZZ] };

    // Avoid launching kernel when there is no work to do
    if (size > 0)
    {
        auto kernelFn = usePBC_ ? packSendBufKernel<true> : packSendBufKernel<false>;

        const auto kernelArgs = prepareGpuKernelArguments(
                kernelFn, config, &sendBuf, &d_x, &indexMap, &size, &coordinateShift);

        launchGpuKernel(
                kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEX);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);

    // Consider time spent in communicateHaloData as Comm.X counter
    // TODO: We need further refinement here as communicateHaloData includes launch time for cudaMemcpyAsync
    wallcycle_start(wcycle_, ewcMOVEX);

    // The wait for remote coordinates is implicit with process-MPI, since the non-local stream is
    // synchronized before the MPI calls, and the MPI_Waitall call ensures that the non-local streams
    // of both neighboring ranks are synchronized before the data transfer is initiated.
    if (GMX_THREAD_MPI && pulse_ == 0)
    {
        enqueueWaitRemoteCoordinatesReadyEvent(coordinatesReadyOnDeviceEvent);
    }

    float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_];
    communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_);

    wallcycle_stop(wcycle_, ewcMOVEX);
}

// The following method should be called after non-local buffer operations,
// and before the local buffer operations. It operates in the non-local stream.
void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
{
    // Consider time spent in communicateHaloData as Comm.F counter
    // TODO: We need further refinement here as communicateHaloData includes launch time for cudaMemcpyAsync
    wallcycle_start(wcycle_, ewcMOVEF);

    float3* recvPtr = GMX_THREAD_MPI ? remoteFPtr_ : d_recvBuf_;

    // Communicate halo data (in non-local stream)
    communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_);

    wallcycle_stop(wcycle_, ewcMOVEF);

    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF);

    float3* d_f = d_f_;
    // If this is the last pulse and index (noting that the force halo
    // exchanges across multiple pulses and indices are called in
    // reverse order) then perform the following preparation activities
    if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1)))
    {
        // ensure non-local stream waits for local stream, due to dependence on
        // the previous H2D copy of CPU forces (if accumulateForces is true)
        // or local force clearing.
        GpuEventSynchronizer eventLocal;
        eventLocal.markEvent(localStream_);
        eventLocal.enqueueWaitEvent(nonLocalStream_);
    }

    // Unpack halo buffer into force array

    KernelLaunchConfig config;
    config.blockSize[0]     = c_threadsPerBlock;
    config.blockSize[1]     = 1;
    config.blockSize[2]     = 1;
    config.gridSize[0]      = (fRecvSize_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
    config.gridSize[1]      = 1;
    config.gridSize[2]      = 1;
    config.sharedMemorySize = 0;

    const float3* recvBuf  = d_recvBuf_;
    const int*    indexMap = d_indexMap_;
    const int     size     = fRecvSize_;

    if (pulse_ > 0 || dd_->ndim > 1)
    {
        // We need to accumulate rather than set, since it is possible
        // that, in this pulse/dim, a value could be written to a location
        // corresponding to the halo region of a following pulse/dim.
        accumulateForces = true;
    }

    if (size > 0)
    {
        auto kernelFn = accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>;

        const auto kernelArgs =
                prepareGpuKernelArguments(kernelFn, config, &d_f, &recvBuf, &indexMap, &size);

        launchGpuKernel(
                kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
    }

    if (pulse_ == 0)
    {
        fReadyOnDevice_.markEvent(nonLocalStream_);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEF);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}

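/*! \brief Exchange halo data with the neighboring rank, using either a direct
 * device-to-device push (thread-MPI) or MPI communication operating directly on
 * device buffers (which requires CUDA-aware MPI).
 */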
void GpuHaloExchange::Impl::communicateHaloData(float3* sendPtr,
                                                int     sendSize,
                                                int     sendRank,
                                                float3* recvPtr,
                                                int     recvSize,
                                                int     recvRank)
{
    if (GMX_THREAD_MPI)
    {
        // no need to explicitly synchronize with thread-MPI, as all operations
        // are launched in the correct stream anyway
        communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, recvPtr, recvRank);
    }
    else
    {
        communicateHaloDataWithCudaMPI(sendPtr, sendSize, sendRank, recvPtr, recvSize, recvRank);
    }
}

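/*! \brief Exchange halo data between the device buffers of neighboring ranks
 * using MPI point-to-point communication that operates directly on device
 * pointers, which requires a CUDA-aware MPI implementation.
 */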
void GpuHaloExchange::Impl::communicateHaloDataWithCudaMPI(float3* sendPtr,
                                                           int     sendSize,
                                                           int     sendRank,
                                                           float3* recvPtr,
                                                           int     recvSize,
                                                           int     recvRank)
{
    // no need to wait for haloDataReadyOnDevice event if this rank is not sending any data
    if (sendSize > 0)
    {
        // wait for the non-local stream to complete all outstanding
        // activities, to ensure that the buffer is up-to-date in GPU memory
        // before transferring it to the remote rank

        // TODO: Replace stream synchronize with event synchronize
        nonLocalStream_.synchronize();
    }

    // perform halo exchange directly in device buffers
#if GMX_MPI
    MPI_Request request;

    // receive remote data into the halo region
    MPI_Irecv(recvPtr, recvSize * DIM, MPI_FLOAT, recvRank, 0, mpi_comm_mysim_, &request);

    // send data to the remote halo region
    MPI_Send(sendPtr, sendSize * DIM, MPI_FLOAT, sendRank, 0, mpi_comm_mysim_);

    MPI_Wait(&request, MPI_STATUS_IGNORE);
#endif
}

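/*! \brief Push halo data directly into the remote rank's device buffer with
 * cudaMemcpyAsync, using GpuEventSynchronizer events exchanged between ranks
 * for synchronization; used only with thread-MPI.
 */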
void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr,
                                                              int     sendSize,
                                                              int     sendRank,
                                                              float3* remotePtr,
                                                              int     recvRank)
{
    // We asynchronously push data to the remote rank. The remote
    // destination pointer has already been set in the init fn. We
    // don't need to worry about overwriting data that the remote rank
    // still needs, since the halo exchange is done just once per
    // timestep for each of X and F.

    // send data to neighbor, if any data exists to send
    if (sendSize > 0)
    {
        cudaError_t stat = cudaMemcpyAsync(remotePtr,
                                           sendPtr,
                                           sendSize * DIM * sizeof(float),
                                           cudaMemcpyDeviceToDevice,
                                           nonLocalStream_.stream());

        CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
    }

#if GMX_MPI
    // Ensure the pushed data has arrived before the remote rank progresses.
    // This rank records an event and sends it to the remote rank, which has just been pushed data.
    // This rank receives an event from the remote rank which has pushed data here, and enqueues
    // that event to its stream.
    GpuEventSynchronizer* haloDataTransferRemote;

    GMX_ASSERT(haloDataTransferLaunched_ != nullptr,
               "Halo exchange requires a valid event to synchronize the data transfer initiated in "
               "the remote rank");
    haloDataTransferLaunched_->markEvent(nonLocalStream_);

    MPI_Sendrecv(&haloDataTransferLaunched_,
                 sizeof(GpuEventSynchronizer*),
                 MPI_BYTE,
                 sendRank,
                 0,
                 &haloDataTransferRemote,
                 sizeof(GpuEventSynchronizer*),
                 MPI_BYTE,
                 recvRank,
                 0,
                 mpi_comm_mysim_,
                 MPI_STATUS_IGNORE);

    haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
#else
    GMX_UNUSED_VALUE(sendRank);
    GMX_UNUSED_VALUE(recvRank);
#endif
}

GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
{
    return &fReadyOnDevice_;
}

/*! \brief Create the Domdec GPU halo exchange object for one DD dimension index and pulse */
GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
                            int                  dimIndex,
                            MPI_Comm             mpi_comm_mysim,
                            const DeviceContext& deviceContext,
                            const DeviceStream&  localStream,
                            const DeviceStream&  nonLocalStream,
                            int                  pulse,
                            gmx_wallcycle*       wcycle) :
    dd_(dd),
    sendRankX_(dd->neighbor[dimIndex][1]),
    recvRankX_(dd->neighbor[dimIndex][0]),
    sendRankF_(dd->neighbor[dimIndex][0]),
    recvRankF_(dd->neighbor[dimIndex][1]),
    usePBC_(dd->ci[dd->dim[dimIndex]] == 0),
    haloDataTransferLaunched_(GMX_THREAD_MPI ? new GpuEventSynchronizer() : nullptr),
    mpi_comm_mysim_(mpi_comm_mysim),
    deviceContext_(deviceContext),
    localStream_(localStream),
    nonLocalStream_(nonLocalStream),
    dimIndex_(dimIndex),
    pulse_(pulse),
    wcycle_(wcycle)
{
    if (usePBC_ && dd->unitCellInfo.haveScrewPBC)
    {
        gmx_fatal(FARGS, "Error: screw PBC is not yet supported in GPU halo exchange\n");
    }

    changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported);

    allocateDeviceBuffer(&d_fShift_, 1, deviceContext_);
}

GpuHaloExchange::Impl::~Impl()
{
    freeDeviceBuffer(&d_indexMap_);
    freeDeviceBuffer(&d_sendBuf_);
    freeDeviceBuffer(&d_recvBuf_);
    freeDeviceBuffer(&d_fShift_);
    delete haloDataTransferLaunched_;
}

GpuHaloExchange::GpuHaloExchange(gmx_domdec_t*        dd,
                                 int                  dimIndex,
                                 MPI_Comm             mpi_comm_mysim,
                                 const DeviceContext& deviceContext,
                                 const DeviceStream&  localStream,
                                 const DeviceStream&  nonLocalStream,
                                 int                  pulse,
                                 gmx_wallcycle*       wcycle) :
    impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
{
}

GpuHaloExchange::GpuHaloExchange(GpuHaloExchange&&) noexcept = default;

GpuHaloExchange& GpuHaloExchange::operator=(GpuHaloExchange&& other) noexcept
{
    std::swap(impl_, other.impl_);
    return *this;
}

GpuHaloExchange::~GpuHaloExchange() = default;

void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> d_coordinatesBuffer, DeviceBuffer<RVec> d_forcesBuffer)
{
    impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer));
}

void GpuHaloExchange::communicateHaloCoordinates(const matrix          box,
                                                 GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
    impl_->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
}

void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
{
    impl_->communicateHaloForces(accumulateForces);
}

GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
{
    return impl_->getForcesReadyOnDeviceEvent();
}
} // namespace gmx