/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief Implements GPU halo exchange using CUDA.
 *
 * \author Alan Gray <alang@nvidia.com>
 *
 * \ingroup module_domdec
 */
#include "gmxpre.h"

#include "gpuhaloexchange_impl.cuh"

#include "config.h"

#include <assert.h>
#include <stdio.h>

#include <utility>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/typecasts.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/math/vectypes.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/gmxmpi.h"

#include "domdec_internal.h"

namespace gmx
{

//! Number of CUDA threads in a block
// TODO Optimize this through experimentation
constexpr static int c_threadsPerBlock = 256;

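/*! \brief Pack non-local coordinate data buffer on the GPU using a pre-populated "map" containing index
 * information
 * \param[out] dataPacked       packed array of coordinate values to be transferred
 * \param[in]  data             full array of coordinate values
 * \param[in]  map              array of indices mapping each packed element to its location in the full array
 * \param[in]  mapSize          number of elements in the map array
 * \param[in]  coordinateShift  shift to apply to coordinates sent across the periodic boundary
 */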
template<bool usePBC>
__global__ void packSendBufKernel(float3* __restrict__ dataPacked,
                                  const float3* __restrict__ data,
                                  const int* __restrict__ map,
                                  const int    mapSize,
                                  const float3 coordinateShift)
{
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadIndex < mapSize)
    {
        // Look up the source index only for in-range threads, to avoid an
        // out-of-bounds read of the map array.
        float3*       gm_dataDest = &dataPacked[threadIndex];
        const float3* gm_dataSrc  = &data[map[threadIndex]];

        if (usePBC)
        {
            *gm_dataDest = *gm_dataSrc + coordinateShift;
        }
        else
        {
            *gm_dataDest = *gm_dataSrc;
        }
    }
}

/*! \brief Unpack non-local force data buffer on the GPU using a pre-populated "map" containing index
 * information
 * \param[out] data        full array of force values
 * \param[in]  dataPacked  packed array of force values to be transferred
 * \param[in]  map         array of indices mapping each packed element to its location in the full array
 * \param[in]  mapSize     number of elements in the map array
 */
template<bool accumulate>
__global__ void unpackRecvBufKernel(float3* __restrict__ data,
                                    const float3* __restrict__ dataPacked,
                                    const int* __restrict__ map,
                                    const int mapSize)
{
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadIndex < mapSize)
    {
        // Look up the destination index only for in-range threads, to avoid an
        // out-of-bounds read of the map array.
        const float3* gm_dataSrc  = &dataPacked[threadIndex];
        float3*       gm_dataDest = &data[map[threadIndex]];

        if (accumulate)
        {
            *gm_dataDest += *gm_dataSrc;
        }
        else
        {
            *gm_dataDest = *gm_dataSrc;
        }
    }
}
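
// As an illustration of the two kernels above, with hypothetical map values
// map = {8, 5} (so mapSize = 2):
//   packSendBufKernel<false>:   dataPacked[0] = data[8];  dataPacked[1] = data[5];
//   packSendBufKernel<true>:    as above, but with coordinateShift added to each value
//   unpackRecvBufKernel<true>:  data[8] += dataPacked[0]; data[5] += dataPacked[1];
//   unpackRecvBufKernel<false>: data[8] = dataPacked[0];  data[5] = dataPacked[1];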

void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_forcesBuffer)
{
    wallcycle_start(wcycle_, ewcDOMDEC);
    wallcycle_sub_start(wcycle_, ewcsDD_GPU);

    d_x_ = d_coordinatesBuffer;
    d_f_ = d_forcesBuffer;

    const gmx_domdec_comm_t&     comm = *dd_->comm;
    const gmx_domdec_comm_dim_t& cd   = comm.cd[dimIndex_];
    const gmx_domdec_ind_t&      ind  = cd.ind[pulse_];

    numHomeAtoms_ = comm.atomRanges.numHomeAtoms(); // offset for data received by this rank

    // Determine receive offset for the dimension index and pulse of this halo exchange object
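    // The loop below walks over all pulses of all decomposition dimensions up
    // to and including the dimIndex_/pulse_ pair of this object, adding the
    // per-pulse received-atom counts on top of the home-atom count, so that
    // atomOffset_ ends up at the first atom received by this object. The
    // number of communication zones doubles with each dimension, which is
    // what the numZoneTemp bookkeeping below tracks.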
    int numZoneTemp   = 1;
    int numZone       = 0;
    int numAtomsTotal = numHomeAtoms_;
    for (int i = 0; i <= dimIndex_; i++)
    {
        int pulseMax = (i == dimIndex_) ? pulse_ : (comm.cd[i].numPulses() - 1);
        for (int p = 0; p <= pulseMax; p++)
        {
            atomOffset_                     = numAtomsTotal;
            const gmx_domdec_ind_t& indTemp = comm.cd[i].ind[p];
            numAtomsTotal += indTemp.nrecv[numZoneTemp + 1];
        }
        numZone = numZoneTemp;
        numZoneTemp += numZoneTemp;
    }

    int newSize = ind.nsend[numZone + 1];

    GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange");

    // reallocates only if needed
    h_indexMap_.resize(newSize);
    // reallocate on device only if needed
    if (newSize > maxPackedBufferSize_)
    {
        reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, deviceContext_);
        reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, deviceContext_);
        reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, deviceContext_);
        maxPackedBufferSize_ = newSize;
    }

    xSendSize_ = newSize;
#if GMX_MPI
    MPI_Sendrecv(&xSendSize_, sizeof(int), MPI_BYTE, sendRankX_, 0, &xRecvSize_, sizeof(int),
                 MPI_BYTE, recvRankX_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
#endif
    fSendSize_ = xRecvSize_;
    fRecvSize_ = xSendSize_;

    if (newSize > 0)
    {
        GMX_ASSERT(ind.index.size() == h_indexMap_.size(),
                   "Size mismatch between domain decomposition communication index array and GPU "
                   "halo exchange index mapping array");
        std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());

        copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_,
                           GpuApiCallBehavior::Async, nullptr);
    }
    // This rank will push data to its neighbor, so it needs to know the
    // remote receive address, and it must similarly send its own receive
    // address to the other neighbor. We can do this here in the reinit
    // function, since the pointers will not change until the next NS step.

    // Coordinates buffer:
    void* recvPtr = static_cast<void*>(&d_x_[atomOffset_]);
#if GMX_MPI
    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0, &remoteXPtr_, sizeof(void*),
                 MPI_BYTE, sendRankX_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);

    // Force buffer:
    recvPtr = static_cast<void*>(d_recvBuf_);
    MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankF_, 0, &remoteFPtr_, sizeof(void*),
                 MPI_BYTE, sendRankF_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
#endif

    wallcycle_sub_stop(wcycle_, ewcsDD_GPU);
    wallcycle_stop(wcycle_, ewcDOMDEC);
}

void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box,
                                                       GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
    wallcycle_start(wcycle_, ewcLAUNCH_GPU);
    if (pulse_ == 0)
    {
        // ensure stream waits until coordinate data is available on device
        coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
    }

    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEX);

    // launch kernel to pack send buffer, using one thread per atom to be packed
    KernelLaunchConfig config;
    config.blockSize[0]     = c_threadsPerBlock;
    config.blockSize[1]     = 1;
    config.blockSize[2]     = 1;
    config.gridSize[0]      = (xSendSize_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
    config.gridSize[1]      = 1;
    config.gridSize[2]      = 1;
    config.sharedMemorySize = 0;

    const float3* sendBuf  = d_sendBuf_;
    const float3* d_x      = d_x_;
    const int*    indexMap = d_indexMap_;
    const int     size     = xSendSize_;
    // The coordinate shift changes between steps when we have performed a DD
    // partition, or have updated the box, e.g. when performing pressure
    // coupling. So, for simplicity, the box is used every step to pass the
    // shift vector as an argument to the packing kernel. The full box row is
    // passed since, for a triclinic box, the shift can have components in all
    // three dimensions.
    const int    boxDimensionIndex = dd_->dim[dimIndex_];
    const float3 coordinateShift{ box[boxDimensionIndex][XX], box[boxDimensionIndex][YY],
                                  box[boxDimensionIndex][ZZ] };

    // Avoid launching kernel when there is no work to do
    if (size > 0)
    {
        auto kernelFn = usePBC_ ? packSendBufKernel<true> : packSendBufKernel<false>;

        const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &sendBuf, &d_x,
                                                          &indexMap, &size, &coordinateShift);

        launchGpuKernel(kernelFn, config, nonLocalStream_, nullptr,
                        "Domdec GPU Apply X Halo Exchange", kernelArgs);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEX);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);

    // Consider time spent in communicateHaloData as the Comm.X counter
    // TODO: We need further refinement here, as communicateHaloData includes
    // the launch time for cudaMemcpyAsync
    wallcycle_start(wcycle_, ewcMOVEX);

    communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);

    wallcycle_stop(wcycle_, ewcMOVEX);
}

// The following method should be called after non-local buffer operations,
// and before the local buffer operations. It operates in the non-local stream.
void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
{
    // Consider time spent in communicateHaloData as the Comm.F counter
    // TODO: We need further refinement here, as communicateHaloData includes
    // the launch time for cudaMemcpyAsync
    wallcycle_start(wcycle_, ewcMOVEF);

    // Communicate halo data (in non-local stream)
    communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr);

    wallcycle_stop(wcycle_, ewcMOVEF);

    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF);

    float3* d_f = d_f_;
    // If this is the last pulse and dimension index (noting that force halo
    // exchanges across multiple pulses and dimension indices are called in
    // reverse order), then perform the following preparation activities
    if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1)))
    {
        if (!accumulateForces)
        {
            // Clear local portion of force array (in local stream)
            cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_.stream());
        }

        // Ensure the non-local stream waits for the local stream, due to its
        // dependence on the previous H2D copy of CPU forces (if
        // accumulateForces is true) or the above clearing.
        // TODO remove this dependency on localStream - Redmine Issue #3093
        GpuEventSynchronizer eventLocal;
        eventLocal.markEvent(localStream_);
        eventLocal.enqueueWaitEvent(nonLocalStream_);
    }

    // Unpack halo buffer into force array

    KernelLaunchConfig config;
    config.blockSize[0]     = c_threadsPerBlock;
    config.blockSize[1]     = 1;
    config.blockSize[2]     = 1;
    config.gridSize[0]      = (fRecvSize_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
    config.gridSize[1]      = 1;
    config.gridSize[2]      = 1;
    config.sharedMemorySize = 0;

    const float3* recvBuf  = d_recvBuf_;
    const int*    indexMap = d_indexMap_;
    const int     size     = fRecvSize_;

    if (pulse_ > 0 || dd_->ndim > 1)
    {
        // We need to accumulate rather than set, since it is possible
        // that, in this pulse/dim, a value could be written to a location
        // corresponding to the halo region of a following pulse/dim.
        accumulateForces = true;
    }

    if (size > 0)
    {
        auto kernelFn = accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>;

        const auto kernelArgs =
                prepareGpuKernelArguments(kernelFn, config, &d_f, &recvBuf, &indexMap, &size);

        launchGpuKernel(kernelFn, config, nonLocalStream_, nullptr,
                        "Domdec GPU Apply F Halo Exchange", kernelArgs);
    }

    if (pulse_ == 0)
    {
        fReadyOnDevice_.markEvent(nonLocalStream_);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEF);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}


void GpuHaloExchange::Impl::communicateHaloData(float3*               d_ptr,
                                                HaloQuantity          haloQuantity,
                                                GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
    void* sendPtr;
    int   sendSize;
    void* remotePtr;
    int   sendRank;
    int   recvRank;

    if (haloQuantity == HaloQuantity::HaloCoordinates)
    {
        sendPtr   = static_cast<void*>(d_sendBuf_);
        sendSize  = xSendSize_;
        remotePtr = remoteXPtr_;
        sendRank  = sendRankX_;
        recvRank  = recvRankX_;

#if GMX_MPI
        // Wait for the event from the receiving task signaling that its remote
        // coordinates are ready, and enqueue that event to the stream used for
        // the subsequent data push. This avoids a race condition with the
        // remote data being written in the previous timestep. Similarly, send
        // an event to the task that will push data to this task.
        GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
        MPI_Sendrecv(&coordinatesReadyOnDeviceEvent, sizeof(GpuEventSynchronizer*), MPI_BYTE,
                     recvRank, 0, &remoteCoordinatesReadyOnDeviceEvent, sizeof(GpuEventSynchronizer*),
                     MPI_BYTE, sendRank, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
        remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
#else
        GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
#endif
    }
    else
    {
        sendPtr   = static_cast<void*>(&(d_ptr[atomOffset_]));
        sendSize  = fSendSize_;
        remotePtr = remoteFPtr_;
        sendRank  = sendRankF_;
        recvRank  = recvRankF_;
    }

    communicateHaloDataWithCudaDirect(sendPtr, sendSize, sendRank, remotePtr, recvRank);
}
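
// Illustrative sequence of one push exchange performed by
// communicateHaloDataWithCudaDirect() below (a sketch; this relies on the
// thread-MPI build, asserted in the constructor, in which all ranks share one
// process and can dereference each other's device pointers and event objects):
//   1. cudaMemcpyAsync() pushes this rank's send buffer directly to the
//      receive address previously obtained from the remote rank.
//   2. This rank marks an event in nonLocalStream_, and the two ranks swap
//      event pointers with MPI_Sendrecv.
//   3. Each rank enqueues a wait on the remote rank's event, so it cannot
//      proceed past this point until the data pushed to it has arrived.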

void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
                                                              int   sendSize,
                                                              int   sendRank,
                                                              void* remotePtr,
                                                              int   recvRank)
{
    cudaError_t stat;

    // We asynchronously push data to the remote rank. The remote destination
    // pointer has already been set in the init function. We don't need to
    // worry about overwriting data that the remote rank still needs, since the
    // halo exchange is performed only once per timestep, for each of X and F.

    // send data to neighbor, if any data exists to send
    if (sendSize > 0)
    {
        stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize * DIM * sizeof(float),
                               cudaMemcpyDeviceToDevice, nonLocalStream_.stream());

        CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
    }

#if GMX_MPI
    // Ensure the pushed data has arrived before the remote rank progresses.
    // This rank records an event and sends it to the remote rank, which has
    // just been pushed data. This rank receives an event from the remote rank,
    // which has pushed data here, and enqueues that event in its stream.
    GpuEventSynchronizer* haloDataTransferRemote;

    haloDataTransferLaunched_->markEvent(nonLocalStream_);

    MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), MPI_BYTE, sendRank, 0,
                 &haloDataTransferRemote, sizeof(GpuEventSynchronizer*), MPI_BYTE, recvRank, 0,
                 mpi_comm_mysim_, MPI_STATUS_IGNORE);

    haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
#else
    GMX_UNUSED_VALUE(sendRank);
    GMX_UNUSED_VALUE(recvRank);
#endif
}

GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
{
    return &fReadyOnDevice_;
}

/*! \brief Create Domdec GPU object */
GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
                            int                  dimIndex,
                            MPI_Comm             mpi_comm_mysim,
                            const DeviceContext& deviceContext,
                            const DeviceStream&  localStream,
                            const DeviceStream&  nonLocalStream,
                            int                  pulse,
                            gmx_wallcycle*       wcycle) :
    dd_(dd),
    sendRankX_(dd->neighbor[dimIndex][1]),
    recvRankX_(dd->neighbor[dimIndex][0]),
    sendRankF_(dd->neighbor[dimIndex][0]),
    recvRankF_(dd->neighbor[dimIndex][1]),
    usePBC_(dd->ci[dd->dim[dimIndex]] == 0), // PBC shift is needed only by ranks at the periodic boundary
    haloDataTransferLaunched_(new GpuEventSynchronizer()),
    mpi_comm_mysim_(mpi_comm_mysim),
    deviceContext_(deviceContext),
    localStream_(localStream),
    nonLocalStream_(nonLocalStream),
    dimIndex_(dimIndex),
    pulse_(pulse),
    wcycle_(wcycle)
{
    GMX_RELEASE_ASSERT(GMX_THREAD_MPI,
                       "GPU Halo exchange is currently only supported with thread-MPI enabled");

    if (usePBC_ && dd->unitCellInfo.haveScrewPBC)
    {
        gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n");
    }

    changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported);

    allocateDeviceBuffer(&d_fShift_, 1, deviceContext_);
}

GpuHaloExchange::Impl::~Impl()
{
    freeDeviceBuffer(&d_indexMap_);
    freeDeviceBuffer(&d_sendBuf_);
    freeDeviceBuffer(&d_recvBuf_);
    freeDeviceBuffer(&d_fShift_);
    delete haloDataTransferLaunched_;
}

GpuHaloExchange::GpuHaloExchange(gmx_domdec_t*        dd,
                                 int                  dimIndex,
                                 MPI_Comm             mpi_comm_mysim,
                                 const DeviceContext& deviceContext,
                                 const DeviceStream&  localStream,
                                 const DeviceStream&  nonLocalStream,
                                 int                  pulse,
                                 gmx_wallcycle*       wcycle) :
    impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
{
}

GpuHaloExchange::GpuHaloExchange(GpuHaloExchange&&) noexcept = default;

GpuHaloExchange& GpuHaloExchange::operator=(GpuHaloExchange&& other) noexcept
{
    std::swap(impl_, other.impl_);
    return *this;
}

GpuHaloExchange::~GpuHaloExchange() = default;

void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> d_coordinatesBuffer, DeviceBuffer<RVec> d_forcesBuffer)
{
    impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer));
}

void GpuHaloExchange::communicateHaloCoordinates(const matrix          box,
                                                 GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
    impl_->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
}

void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
{
    impl_->communicateHaloForces(accumulateForces);
}

GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
{
    return impl_->getForcesReadyOnDeviceEvent();
}
} // namespace gmx