Apply clang-tidy-11 fixes to CUDA files
[alexxy/gromacs.git] src/gromacs/domdec/gpuhaloexchange_impl.cu
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief Implements GPU halo exchange using CUDA.
 *
 * \author Alan Gray <alang@nvidia.com>
 *
 * \ingroup module_domdec
 */
#include "gmxpre.h"

#include "gpuhaloexchange_impl.cuh"

#include "config.h"

#include <assert.h>
#include <stdio.h>

#include <utility>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/typecasts.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/math/vectypes.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/gmxmpi.h"

#include "domdec_internal.h"

namespace gmx
{

//! Number of CUDA threads in a block
// TODO Optimize this through experimentation
constexpr static int c_threadsPerBlock = 256;

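/*! \brief Pack non-local coordinate data buffer on the GPU using a pre-populated "map" containing index
 * information.
 * \param[out] dataPacked       packed array of coordinate values to be transferred
 * \param[in]  data             full array of coordinate values
 * \param[in]  map              array of indices mapping packed-array elements to their locations in the full array
 * \param[in]  mapSize          number of elements in map array
 * \param[in]  coordinateShift  shift to apply to coordinates when usePBC is true
 */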
template<bool usePBC>
__global__ void packSendBufKernel(float3* __restrict__ dataPacked,
                                  const float3* __restrict__ data,
                                  const int* __restrict__ map,
                                  const int    mapSize,
                                  const float3 coordinateShift)
{
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadIndex < mapSize)
    {
        // Look up source and destination addresses inside the bounds check,
        // so that out-of-range threads do not read past the end of the map.
        float3*       gm_dataDest = &dataPacked[threadIndex];
        const float3* gm_dataSrc  = &data[map[threadIndex]];

        if (usePBC)
        {
            *gm_dataDest = *gm_dataSrc + coordinateShift;
        }
        else
        {
            *gm_dataDest = *gm_dataSrc;
        }
    }
}

/*! \brief Unpack non-local force data buffer on the GPU using a pre-populated "map" containing index
 * information.
 * \param[out] data        full array of force values
 * \param[in]  dataPacked  packed array of force values to be transferred
 * \param[in]  map         array of indices mapping packed-array elements to their locations in the full array
 * \param[in]  mapSize     number of elements in map array
 */
template<bool accumulate>
__global__ void unpackRecvBufKernel(float3* __restrict__ data,
                                    const float3* __restrict__ dataPacked,
                                    const int* __restrict__ map,
                                    const int mapSize)
{
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadIndex < mapSize)
    {
        // Look up source and destination addresses inside the bounds check,
        // so that out-of-range threads do not read past the end of the map.
        const float3* gm_dataSrc  = &dataPacked[threadIndex];
        float3*       gm_dataDest = &data[map[threadIndex]];

        if (accumulate)
        {
            *gm_dataDest += *gm_dataSrc;
        }
        else
        {
            *gm_dataDest = *gm_dataSrc;
        }
    }
}

void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_forcesBuffer)
{
    wallcycle_start(wcycle_, WallCycleCounter::Domdec);
    wallcycle_sub_start(wcycle_, WallCycleSubCounter::DDGpu);

    d_x_ = d_coordinatesBuffer;
    d_f_ = d_forcesBuffer;

    const gmx_domdec_comm_t&     comm = *dd_->comm;
    const gmx_domdec_comm_dim_t& cd   = comm.cd[dimIndex_];
    const gmx_domdec_ind_t&      ind  = cd.ind[pulse_];

    numHomeAtoms_ = comm.atomRanges.numHomeAtoms(); // offset for data received by this rank

    // Determine receive offset for the dimension index and pulse of this halo exchange object
    int numZoneTemp   = 1;
    int numZone       = 0;
    int numAtomsTotal = numHomeAtoms_;
    for (int i = 0; i <= dimIndex_; i++)
    {
        int pulseMax = (i == dimIndex_) ? pulse_ : (comm.cd[i].numPulses() - 1);
        for (int p = 0; p <= pulseMax; p++)
        {
            atomOffset_                     = numAtomsTotal;
            const gmx_domdec_ind_t& indTemp = comm.cd[i].ind[p];
            numAtomsTotal += indTemp.nrecv[numZoneTemp + 1];
        }
        numZone = numZoneTemp;
        numZoneTemp += numZoneTemp;
    }

    int newSize = ind.nsend[numZone + 1];

    GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange");

    // reallocates only if needed
    h_indexMap_.resize(newSize);
    // reallocate on device only if needed
    if (newSize > maxPackedBufferSize_)
    {
        reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, deviceContext_);
        reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, deviceContext_);
        reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, deviceContext_);
        maxPackedBufferSize_ = newSize;
    }

    xSendSize_ = newSize;
#if GMX_MPI
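    // Exchange the send size with the neighboring rank, so that this rank
    // knows how many halo atoms it will receive (xRecvSize_) in return.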
    MPI_Sendrecv(&xSendSize_,
                 sizeof(int),
                 MPI_BYTE,
                 sendRankX_,
                 0,
                 &xRecvSize_,
                 sizeof(int),
                 MPI_BYTE,
                 recvRankX_,
                 0,
                 mpi_comm_mysim_,
                 MPI_STATUS_IGNORE);
#endif
    fSendSize_ = xRecvSize_;
    fRecvSize_ = xSendSize_;

    if (newSize > 0)
    {
        GMX_ASSERT(ind.index.size() == h_indexMap_.size(),
                   "Size mismatch between domain decomposition communication index array and GPU "
                   "halo exchange index mapping array");
        std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());

        copyToDeviceBuffer(
                &d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, GpuApiCallBehavior::Async, nullptr);
    }

#if GMX_MPI
    // Exchange of remote addresses from neighboring ranks is needed only with CUDA-direct, since
    // cudaMemcpy needs both the source and destination pointers. MPI calls such as MPI_Send do not
    // need the receiving address; that is taken care of by the MPI_Recv call on the neighboring rank.
    if (GMX_THREAD_MPI)
    {
        // This rank will push data to its neighbor, so it needs to know
        // the remote receive address, and similarly it sends its own receive
        // address to the other neighbor. We can do this here in the reinit fn
        // since the pointers will not change until the next NS step.

        // Coordinates buffer:
        float3* recvPtr = &d_x_[atomOffset_];
        MPI_Sendrecv(&recvPtr,
                     sizeof(void*),
                     MPI_BYTE,
                     recvRankX_,
                     0,
                     &remoteXPtr_,
                     sizeof(void*),
                     MPI_BYTE,
                     sendRankX_,
                     0,
                     mpi_comm_mysim_,
                     MPI_STATUS_IGNORE);

        // Force buffer:
        recvPtr = d_recvBuf_;
        MPI_Sendrecv(&recvPtr,
                     sizeof(void*),
                     MPI_BYTE,
                     recvRankF_,
                     0,
                     &remoteFPtr_,
                     sizeof(void*),
                     MPI_BYTE,
                     sendRankF_,
                     0,
                     mpi_comm_mysim_,
                     MPI_STATUS_IGNORE);
    }
#endif

    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::DDGpu);
    wallcycle_stop(wcycle_, WallCycleCounter::Domdec);
}

void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
    GMX_ASSERT(coordinatesReadyOnDeviceEvent != nullptr,
               "Coordinate halo exchange requires a valid coordinate-ready event");

    // Wait for the event from the receiving task signaling that remote coordinates are ready, and
    // enqueue that event to the stream used for the subsequent data push. This avoids a race
    // condition with the remote data being written in the previous timestep. Similarly, send this
    // rank's event to the task that will push data to this task.
    GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
    MPI_Sendrecv(&coordinatesReadyOnDeviceEvent,
                 sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
                 MPI_BYTE,
                 recvRankX_,
                 0,
                 &remoteCoordinatesReadyOnDeviceEvent,
                 sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
                 MPI_BYTE,
                 sendRankX_,
                 0,
                 mpi_comm_mysim_,
                 MPI_STATUS_IGNORE);
    remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
}

void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box,
                                                       GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
    wallcycle_start(wcycle_, WallCycleCounter::LaunchGpu);
    if (pulse_ == 0)
    {
        // ensure stream waits until coordinate data is available on device
        coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
    }

    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);

    // launch kernel to pack send buffer
    KernelLaunchConfig config;
    config.blockSize[0]     = c_threadsPerBlock;
    config.blockSize[1]     = 1;
    config.blockSize[2]     = 1;
    config.gridSize[0]      = (xSendSize_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
    config.gridSize[1]      = 1;
    config.gridSize[2]      = 1;
    config.sharedMemorySize = 0;

    const float3* sendBuf  = d_sendBuf_;
    const float3* d_x      = d_x_;
    const int*    indexMap = d_indexMap_;
    const int     size     = xSendSize_;
    // The coordinateShift changes between steps when we have
    // performed a DD partition, or have updated the box, e.g. when
    // performing pressure coupling. So, for simplicity, the box
    // is used every step to pass the shift vector as an argument of
    // the packing kernel.
    const int    boxDimensionIndex = dd_->dim[dimIndex_];
    const float3 coordinateShift{ box[boxDimensionIndex][XX],
                                  box[boxDimensionIndex][YY],
                                  box[boxDimensionIndex][ZZ] };

    // Avoid launching kernel when there is no work to do
    if (size > 0)
    {
        auto kernelFn = usePBC_ ? packSendBufKernel<true> : packSendBufKernel<false>;

        const auto kernelArgs = prepareGpuKernelArguments(
                kernelFn, config, &sendBuf, &d_x, &indexMap, &size, &coordinateShift);

        launchGpuKernel(
                kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
    }

    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);
    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);

    // Consider time spent in communicateHaloData as Comm.X counter
    // TODO: We need further refinement here, as communicateHaloData includes launch time for cudaMemcpyAsync
    wallcycle_start(wcycle_, WallCycleCounter::MoveX);

    // The wait for remote coordinates is implicit with process MPI, since the non-local stream is
    // synchronized before the MPI calls, and the MPI_Wait call makes sure the non-local streams of
    // both neighboring ranks are synchronized before the data transfer is initiated.
    if (GMX_THREAD_MPI && pulse_ == 0)
    {
        enqueueWaitRemoteCoordinatesReadyEvent(coordinatesReadyOnDeviceEvent);
    }

    float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_];
    communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_);

    wallcycle_stop(wcycle_, WallCycleCounter::MoveX);
}

// The following method should be called after non-local buffer operations,
// and before the local buffer operations. It operates in the non-local stream.
void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
{
    // Consider time spent in communicateHaloData as Comm.F counter
    // TODO: We need further refinement here, as communicateHaloData includes launch time for cudaMemcpyAsync
    wallcycle_start(wcycle_, WallCycleCounter::MoveF);

    float3* recvPtr = GMX_THREAD_MPI ? remoteFPtr_ : d_recvBuf_;

    // Communicate halo data (in non-local stream)
    communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_);

    wallcycle_stop(wcycle_, WallCycleCounter::MoveF);

    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);

    float3* d_f = d_f_;
    // If this is the last pulse and index (noting that the force halo
    // exchanges across multiple pulses and indices are called in
    // reverse order), then perform the following preparation
    // activities
    if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1)))
    {
        // ensure non-local stream waits for local stream, due to dependence on
        // the previous H2D copy of CPU forces (if accumulateForces is true)
        // or local force clearing.
        GpuEventSynchronizer eventLocal;
        eventLocal.markEvent(localStream_);
        eventLocal.enqueueWaitEvent(nonLocalStream_);
    }

    // Unpack halo buffer into force array

    KernelLaunchConfig config;
    config.blockSize[0]     = c_threadsPerBlock;
    config.blockSize[1]     = 1;
    config.blockSize[2]     = 1;
    config.gridSize[0]      = (fRecvSize_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
    config.gridSize[1]      = 1;
    config.gridSize[2]      = 1;
    config.sharedMemorySize = 0;

    const float3* recvBuf  = d_recvBuf_;
    const int*    indexMap = d_indexMap_;
    const int     size     = fRecvSize_;

    if (pulse_ > 0 || dd_->ndim > 1)
    {
        // We need to accumulate rather than set, since it is possible
        // that, in this pulse/dim, a value could be written to a location
        // corresponding to the halo region of a following pulse/dim.
        accumulateForces = true;
    }

    if (size > 0)
    {
        auto kernelFn = accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>;

        const auto kernelArgs =
                prepareGpuKernelArguments(kernelFn, config, &d_f, &recvBuf, &indexMap, &size);

        launchGpuKernel(
                kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
    }

    if (pulse_ == 0)
    {
        fReadyOnDevice_.markEvent(nonLocalStream_);
    }

    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);
    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}

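/*! \brief Communicate halo data with the neighboring rank, dispatching to the CUDA-direct
 * implementation with thread MPI, or to the MPI implementation otherwise. */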
void GpuHaloExchange::Impl::communicateHaloData(float3* sendPtr,
                                                int     sendSize,
                                                int     sendRank,
                                                float3* recvPtr,
                                                int     recvSize,
                                                int     recvRank)
{
    if (GMX_THREAD_MPI)
    {
        // no need to explicitly sync with GMX_THREAD_MPI, as all operations are
        // launched in the correct stream anyway
        communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, recvPtr, recvRank);
    }
    else
    {
        communicateHaloDataWithCudaMPI(sendPtr, sendSize, sendRank, recvPtr, recvSize, recvRank);
    }
}

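/*! \brief Communicate halo data between the device buffers of neighboring ranks using
 * (CUDA-aware) MPI. */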
void GpuHaloExchange::Impl::communicateHaloDataWithCudaMPI(float3* sendPtr,
                                                           int     sendSize,
                                                           int     sendRank,
                                                           float3* recvPtr,
                                                           int     recvSize,
                                                           int     recvRank)
{
    // no need to wait for the haloDataReadyOnDevice event if this rank is not sending any data
    if (sendSize > 0)
    {
        // wait for the non-local stream to complete all outstanding
        // activities, to ensure that the buffer is up-to-date in GPU memory
        // before transferring it to the remote rank

        // TODO: Replace stream synchronize with event synchronize
        nonLocalStream_.synchronize();
    }

    // perform halo exchange directly in device buffers
#if GMX_MPI
    MPI_Request request;

    // recv remote data into halo region
    MPI_Irecv(recvPtr, recvSize * DIM, MPI_FLOAT, recvRank, 0, mpi_comm_mysim_, &request);

    // send data to remote halo region
    MPI_Send(sendPtr, sendSize * DIM, MPI_FLOAT, sendRank, 0, mpi_comm_mysim_);

    MPI_Wait(&request, MPI_STATUS_IGNORE);
#endif
}

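/*! \brief Communicate halo data by pushing it directly to the remote rank's device buffer with
 * cudaMemcpyAsync (used with thread MPI), synchronizing with the remote rank via exchanged events. */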
void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr,
                                                              int     sendSize,
                                                              int     sendRank,
                                                              float3* remotePtr,
                                                              int     recvRank)
{
    cudaError_t stat;

    // We asynchronously push data to the remote rank. The remote
    // destination pointer has already been set in the init fn. We
    // don't need to worry about overwriting data that the remote rank
    // still needs, since the halo exchange is done just once per
    // timestep, for each of X and F.

    // send data to neighbor, if any data exists to send
    if (sendSize > 0)
    {
        stat = cudaMemcpyAsync(remotePtr,
                               sendPtr,
                               sendSize * DIM * sizeof(float),
                               cudaMemcpyDeviceToDevice,
                               nonLocalStream_.stream());

        CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
    }

#if GMX_MPI
    // Ensure the pushed data has arrived before the remote rank progresses.
    // This rank records an event and sends it to the remote rank to which it has just pushed data.
    // This rank receives an event from the remote rank that has pushed data here, and enqueues
    // that event to its stream.
    GpuEventSynchronizer* haloDataTransferRemote;

    GMX_ASSERT(haloDataTransferLaunched_ != nullptr,
               "Halo exchange requires a valid event to synchronize the data transfer initiated in "
               "the remote rank");
    haloDataTransferLaunched_->markEvent(nonLocalStream_);

    MPI_Sendrecv(&haloDataTransferLaunched_,
                 sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
                 MPI_BYTE,
                 sendRank,
                 0,
                 &haloDataTransferRemote,
                 sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
                 MPI_BYTE,
                 recvRank,
                 0,
                 mpi_comm_mysim_,
                 MPI_STATUS_IGNORE);

    haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
#else
    GMX_UNUSED_VALUE(sendRank);
    GMX_UNUSED_VALUE(recvRank);
#endif
}

GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
{
    return &fReadyOnDevice_;
}

/*! \brief Create Domdec GPU object */
GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
                            int                  dimIndex,
                            MPI_Comm             mpi_comm_mysim,
                            const DeviceContext& deviceContext,
                            const DeviceStream&  localStream,
                            const DeviceStream&  nonLocalStream,
                            int                  pulse,
                            gmx_wallcycle*       wcycle) :
    dd_(dd),
    sendRankX_(dd->neighbor[dimIndex][1]),
    recvRankX_(dd->neighbor[dimIndex][0]),
    sendRankF_(dd->neighbor[dimIndex][0]),
    recvRankF_(dd->neighbor[dimIndex][1]),
    usePBC_(dd->ci[dd->dim[dimIndex]] == 0),
    haloDataTransferLaunched_(GMX_THREAD_MPI ? new GpuEventSynchronizer() : nullptr),
    mpi_comm_mysim_(mpi_comm_mysim),
    deviceContext_(deviceContext),
    localStream_(localStream),
    nonLocalStream_(nonLocalStream),
    dimIndex_(dimIndex),
    pulse_(pulse),
    wcycle_(wcycle)
{
    if (usePBC_ && dd->unitCellInfo.haveScrewPBC)
    {
        gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n");
    }

    changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported);

    allocateDeviceBuffer(&d_fShift_, 1, deviceContext_);
}

GpuHaloExchange::Impl::~Impl()
{
    freeDeviceBuffer(&d_indexMap_);
    freeDeviceBuffer(&d_sendBuf_);
    freeDeviceBuffer(&d_recvBuf_);
    freeDeviceBuffer(&d_fShift_);
    delete haloDataTransferLaunched_;
}

GpuHaloExchange::GpuHaloExchange(gmx_domdec_t*        dd,
                                 int                  dimIndex,
                                 MPI_Comm             mpi_comm_mysim,
                                 const DeviceContext& deviceContext,
                                 const DeviceStream&  localStream,
                                 const DeviceStream&  nonLocalStream,
                                 int                  pulse,
                                 gmx_wallcycle*       wcycle) :
    impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
{
}

GpuHaloExchange::GpuHaloExchange(GpuHaloExchange&&) noexcept = default;

GpuHaloExchange& GpuHaloExchange::operator=(GpuHaloExchange&& other) noexcept
{
    std::swap(impl_, other.impl_);
    return *this;
}

GpuHaloExchange::~GpuHaloExchange() = default;

void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> d_coordinatesBuffer, DeviceBuffer<RVec> d_forcesBuffer)
{
    impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer));
}

void GpuHaloExchange::communicateHaloCoordinates(const matrix          box,
                                                 GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
    impl_->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
}

void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
{
    impl_->communicateHaloForces(accumulateForces);
}

GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
{
    return impl_->getForcesReadyOnDeviceEvent();
}
} // namespace gmx