/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief Definitions of interfaces for GPU state data propagator object.
 *
 * \author Artem Zhmurov <zhmurov@gmail.com>
 *
 * \ingroup module_mdtypes
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU != GMX_GPU_NONE

#    if GMX_GPU == GMX_GPU_CUDA
#        include "gromacs/gpu_utils/cudautils.cuh"
#    endif
#    include "gromacs/gpu_utils/devicebuffer.h"
#    include "gromacs/gpu_utils/gputraits.h"
#    if GMX_GPU == GMX_GPU_OPENCL
#        include "gromacs/gpu_utils/oclutils.h"
#    endif
#    include "gromacs/math/vectypes.h"
#    include "gromacs/mdtypes/state_propagator_data_gpu.h"
#    include "gromacs/timing/wallcycle.h"
#    include "gromacs/utility/classhelpers.h"

#    include "state_propagator_data_gpu_impl.h"


namespace gmx
{

StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
                                   const DeviceStream*  localStream,
                                   const DeviceStream*  nonLocalStream,
                                   const DeviceContext& deviceContext,
                                   GpuApiCallBehavior   transferKind,
                                   int                  paddingSize,
                                   gmx_wallcycle*       wcycle) :
    deviceContext_(deviceContext),
    transferKind_(transferKind),
    paddingSize_(paddingSize),
    wcycle_(wcycle)
{
    static_assert(GMX_GPU != GMX_GPU_NONE,
                  "This object should only be constructed on the GPU code-paths.");

    // TODO: Refactor when the StreamManager is introduced.
    if (GMX_GPU == GMX_GPU_OPENCL)
    {
        GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");

        // The update stream is set to the PME stream in OpenCL, since the PME stream is the only stream created in the PME context.
        pmeStream_    = pmeStream;
        updateStream_ = pmeStream;
        GMX_UNUSED_VALUE(localStream);
        GMX_UNUSED_VALUE(nonLocalStream);
    }

    if (GMX_GPU == GMX_GPU_CUDA)
    {
        if (pmeStream != nullptr)
        {
            pmeStream_ = pmeStream;
        }
        if (localStream != nullptr)
        {
            localStream_ = localStream;
        }
        if (nonLocalStream != nullptr)
        {
            nonLocalStream_ = nonLocalStream;
        }

        // TODO: The update stream should be created only when it is needed.
#    if (GMX_GPU == GMX_GPU_CUDA)
        cudaError_t  stat;
        cudaStream_t stream;
        stat = cudaStreamCreate(&stream);
        CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
        updateStreamOwn_.setStream(stream);
        updateStream_ = &updateStreamOwn_;
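        // Note on ownership (descriptive comment, added for clarity):
        // updateStreamOwn_ holds the DeviceStream object wrapping the raw
        // cudaStream_t created above, while updateStream_ is the same
        // non-owning pointer type used for all the other streams below.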
#    endif
    }

    // Map the atom locality to the stream that will be used for coordinates,
    // velocities and forces transfers. The same streams are used for H2D and D2H copies.
    // Note that a nullptr stream is used here to indicate that the copy is not supported.
    xCopyStreams_[AtomLocality::Local]    = updateStream_;
    xCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
    xCopyStreams_[AtomLocality::All]      = updateStream_;

    vCopyStreams_[AtomLocality::Local]    = updateStream_;
    vCopyStreams_[AtomLocality::NonLocal] = nullptr;
    vCopyStreams_[AtomLocality::All]      = updateStream_;

    fCopyStreams_[AtomLocality::Local]    = localStream_;
    fCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
    fCopyStreams_[AtomLocality::All]      = updateStream_;
}
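
// A minimal construction sketch (illustrative only, not part of the original
// file; the stream/context objects are hypothetical, and GpuApiCallBehavior::Async
// is assumed to be the desired transfer mode). In a CUDA build with domain
// decomposition the full constructor would be wired roughly like this:
//
//   StatePropagatorDataGpu stateGpu(&pmeStream, &localStream, &nonLocalStream,
//                                   deviceContext, GpuApiCallBehavior::Async,
//                                   paddingSize, wcycle);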

StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
                                   const DeviceContext& deviceContext,
                                   GpuApiCallBehavior   transferKind,
                                   int                  paddingSize,
                                   gmx_wallcycle*       wcycle) :
    deviceContext_(deviceContext),
    transferKind_(transferKind),
    paddingSize_(paddingSize),
    wcycle_(wcycle)
{
    static_assert(GMX_GPU != GMX_GPU_NONE,
                  "This object should only be constructed on the GPU code-paths.");

    GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set.");
    pmeStream_      = pmeStream;
    localStream_    = pmeStream; // For clearing the force buffer
    nonLocalStream_ = nullptr;
    updateStream_   = nullptr;


    // Only local/all coordinates are allowed to be copied on a PME-only rank or in PME tests.
    // This is a temporary measure to make it safe to use this class in those cases.
    xCopyStreams_[AtomLocality::Local]    = pmeStream_;
    xCopyStreams_[AtomLocality::NonLocal] = nullptr;
    xCopyStreams_[AtomLocality::All]      = pmeStream_;

    vCopyStreams_[AtomLocality::Local]    = nullptr;
    vCopyStreams_[AtomLocality::NonLocal] = nullptr;
    vCopyStreams_[AtomLocality::All]      = nullptr;

    fCopyStreams_[AtomLocality::Local]    = nullptr;
    fCopyStreams_[AtomLocality::NonLocal] = nullptr;
    fCopyStreams_[AtomLocality::All]      = nullptr;
}

StatePropagatorDataGpu::Impl::~Impl() {}

void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
{
    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    numAtomsLocal_ = numAtomsLocal;
    numAtomsAll_   = numAtomsAll;

    int numAtomsPadded;
    if (paddingSize_ > 0)
    {
        numAtomsPadded = ((numAtomsAll_ + paddingSize_ - 1) / paddingSize_) * paddingSize_;
    }
    else
    {
        numAtomsPadded = numAtomsAll_;
    }
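
    // Worked example of the rounding above (hypothetical numbers, added for
    // clarity): with paddingSize_ == 256 and numAtomsAll_ == 1000,
    // numAtomsPadded == ((1000 + 255) / 256) * 256 == 1024, so elements
    // [1000, 1024) form the padding region that is cleared below.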

    reallocateDeviceBuffer(&d_x_, numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);

    const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
    if (paddingAllocationSize > 0)
    {
        // The PME stream is used here because the padding region of d_x_ is only used in the PME task.
        clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, *pmeStream_);
    }

    reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
    const int d_fOldCapacity = d_fCapacity_;
    reallocateDeviceBuffer(&d_f_, numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
    // Clearing of the forces can be done in the local stream, since the nonlocal stream cannot
    // reach the force accumulation stage before syncing with the local stream. Only done in CUDA,
    // since the force buffer ops are not implemented in OpenCL.
    if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
    {
        clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}

std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
{
    int atomsStartAt   = 0;
    int numAtomsToCopy = 0;
    switch (atomLocality)
    {
        case AtomLocality::All:
            atomsStartAt   = 0;
            numAtomsToCopy = numAtomsAll_;
            break;
        case AtomLocality::Local:
            atomsStartAt   = 0;
            numAtomsToCopy = numAtomsLocal_;
            break;
        case AtomLocality::NonLocal:
            atomsStartAt   = numAtomsLocal_;
            numAtomsToCopy = numAtomsAll_ - numAtomsLocal_;
            break;
        default:
            GMX_RELEASE_ASSERT(false,
                               "Wrong range of atoms requested in GPU state data manager. Should "
                               "be All, Local or NonLocal.");
    }
    GMX_ASSERT(atomsStartAt >= 0,
               "The first element to copy has a negative index. Probably, the GPU propagator state "
               "was not initialized.");
    GMX_ASSERT(numAtomsToCopy >= 0,
               "Number of atoms to copy is negative. Probably, the GPU propagator state was not "
               "initialized.");
    return std::make_tuple(atomsStartAt, numAtomsToCopy);
}
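
// Example of the ranges returned above (hypothetical numbers, added for
// clarity): with numAtomsLocal_ == 100 and numAtomsAll_ == 150, the
// (atomsStartAt, numAtomsToCopy) tuples are Local -> (0, 100),
// NonLocal -> (100, 50) and All -> (0, 150).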

void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>                   d_data,
                                                const gmx::ArrayRef<const gmx::RVec> h_data,
                                                int                                  dataSize,
                                                AtomLocality                         atomLocality,
                                                const DeviceStream&                  deviceStream)
{
    GMX_UNUSED_VALUE(dataSize);

    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");

    GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");

    GMX_ASSERT(deviceStream.stream() != nullptr,
               "No stream is valid for copying with given atom locality.");
    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    int atomsStartAt, numAtomsToCopy;
    std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);

    if (numAtomsToCopy != 0)
    {
        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
                   "The device allocation is smaller than requested copy range.");
        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
                   "The host buffer is smaller than the requested copy range.");

        copyToDeviceBuffer(&d_data, reinterpret_cast<const RVec*>(&h_data.data()[atomsStartAt]),
                           atomsStartAt, numAtomsToCopy, deviceStream, transferKind_, nullptr);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}

void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
                                                  DeviceBuffer<RVec>       d_data,
                                                  int                      dataSize,
                                                  AtomLocality             atomLocality,
                                                  const DeviceStream&      deviceStream)
{
    GMX_UNUSED_VALUE(dataSize);

    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");

    GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");

    GMX_ASSERT(deviceStream.stream() != nullptr,
               "No stream is valid for copying with given atom locality.");
    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    int atomsStartAt, numAtomsToCopy;
    std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);

    if (numAtomsToCopy != 0)
    {
        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
                   "The device allocation is smaller than requested copy range.");
        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
                   "The host buffer is smaller than the requested copy range.");

        copyFromDeviceBuffer(reinterpret_cast<RVec*>(&h_data.data()[atomsStartAt]), &d_data,
                             atomsStartAt, numAtomsToCopy, deviceStream, transferKind_, nullptr);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}

DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getCoordinates()
{
    return d_x_;
}

void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
                                                        AtomLocality atomLocality)
{
    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
    const DeviceStream* deviceStream = xCopyStreams_[atomLocality];
    GMX_ASSERT(deviceStream != nullptr,
               "No stream is valid for copying positions with given atom locality.");

    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);

    // markEvent is skipped in OpenCL as:
    //   - it's not needed, since the copy is done in the same stream as the only consumer task (PME);
    //   - the events are never consumed in OpenCL, which GpuEventSynchronizer does not allow (it would leak memory).
    // TODO: remove this by adding an event-mark free flavor of this function
    if (GMX_GPU == GMX_GPU_CUDA)
    {
        xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
    }

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}

GpuEventSynchronizer*
StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
                                                               const SimulationWorkload& simulationWork,
                                                               const StepWorkload&       stepWork)
{
    // The provider of the coordinates may be different for local atoms. If the update is offloaded
    // and this is not a neighbor search step, then the consumer needs to wait for the update
    // to complete. Otherwise, the coordinates are copied from the host and we need to wait for
    // the copy event. Non-local coordinates are always provided by the H2D copy.
    //
    // TODO: This should be reconsidered to support the halo exchange.
    //
    // In OpenCL, no events are used, as coordinate synchronization is not necessary.
    if (GMX_GPU == GMX_GPU_OPENCL)
    {
        return nullptr;
    }
    if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
    {
        return &xUpdatedOnDevice_;
    }
    else
    {
        return &xReadyOnDevice_[atomLocality];
    }
}
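
// A consumer-side sketch (illustrative only; 'stateGpu' and 'consumerStream'
// are hypothetical, and enqueueWaitEvent() is assumed to be the
// GpuEventSynchronizer method for making a stream wait on the event):
//
//   GpuEventSynchronizer* sync = stateGpu->getCoordinatesReadyOnDeviceEvent(
//           AtomLocality::Local, simulationWork, stepWork);
//   if (sync != nullptr) // nullptr is returned in OpenCL builds
//   {
//       sync->enqueueWaitEvent(consumerStream);
//   }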

void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
{
    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
    xReadyOnDevice_[atomLocality].waitForEvent();
    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
}

GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
{
    return &xUpdatedOnDevice_;
}

void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
{
    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
    const DeviceStream* deviceStream = xCopyStreams_[atomLocality];
    GMX_ASSERT(deviceStream != nullptr,
               "No stream is valid for copying positions with given atom locality.");

    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, *deviceStream);
    // Note: unlike copyCoordinatesToGpu, this function is not used in OpenCL, so the conditional is not needed.
    xReadyOnHost_[atomLocality].markEvent(*deviceStream);

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}

void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
{
    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
    xReadyOnHost_[atomLocality].waitForEvent();
    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
}


DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getVelocities()
{
    return d_v_;
}

void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
                                                       AtomLocality atomLocality)
{
    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
    const DeviceStream* deviceStream = vCopyStreams_[atomLocality];
    GMX_ASSERT(deviceStream != nullptr,
               "No stream is valid for copying velocities with given atom locality.");

    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    copyToDevice(d_v_, h_v, d_vSize_, atomLocality, *deviceStream);
    vReadyOnDevice_[atomLocality].markEvent(*deviceStream);

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}

GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
{
    return &vReadyOnDevice_[atomLocality];
}


void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v, AtomLocality atomLocality)
{
    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
    const DeviceStream* deviceStream = vCopyStreams_[atomLocality];
    GMX_ASSERT(deviceStream != nullptr,
               "No stream is valid for copying velocities with given atom locality.");

    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, *deviceStream);
    vReadyOnHost_[atomLocality].markEvent(*deviceStream);

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}

void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
{
    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
    vReadyOnHost_[atomLocality].waitForEvent();
    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
}


DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getForces()
{
    return d_f_;
}

void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
                                                   AtomLocality atomLocality)
{
    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
    const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
    GMX_ASSERT(deviceStream != nullptr,
               "No stream is valid for copying forces with given atom locality.");

    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    copyToDevice(d_f_, h_f, d_fSize_, atomLocality, *deviceStream);
    fReadyOnDevice_[atomLocality].markEvent(*deviceStream);

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}

GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
                                                                                bool useGpuFBufferOps)
{
    if ((atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal) && useGpuFBufferOps)
    {
        return &fReducedOnDevice_;
    }
    else
    {
        return &fReadyOnDevice_[atomLocality];
    }
}

GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice()
{
    return &fReducedOnDevice_;
}

void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
{
    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
    const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
    GMX_ASSERT(deviceStream != nullptr,
               "No stream is valid for copying forces with given atom locality.");

    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);

    copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, *deviceStream);
    fReadyOnHost_[atomLocality].markEvent(*deviceStream);

    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}

void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)
{
    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
    fReadyOnHost_[atomLocality].waitForEvent();
    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
}

const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream()
{
    return updateStream_;
}

int StatePropagatorDataGpu::Impl::numAtomsLocal()
{
    return numAtomsLocal_;
}

int StatePropagatorDataGpu::Impl::numAtomsAll()
{
    return numAtomsAll_;
}


StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream*  pmeStream,
                                               const DeviceStream*  localStream,
                                               const DeviceStream*  nonLocalStream,
                                               const DeviceContext& deviceContext,
                                               GpuApiCallBehavior   transferKind,
                                               int                  paddingSize,
                                               gmx_wallcycle*       wcycle) :
    impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle))
{
}

StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream*  pmeStream,
                                               const DeviceContext& deviceContext,
                                               GpuApiCallBehavior   transferKind,
                                               int                  paddingSize,
                                               gmx_wallcycle*       wcycle) :
    impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle))
{
}

StatePropagatorDataGpu::StatePropagatorDataGpu(StatePropagatorDataGpu&& /* other */) noexcept = default;

StatePropagatorDataGpu& StatePropagatorDataGpu::operator=(StatePropagatorDataGpu&& /* other */) noexcept = default;

StatePropagatorDataGpu::~StatePropagatorDataGpu() = default;
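
// A typical host-side call sequence (illustrative sketch; 'stateGpu' and 'h_x'
// are hypothetical, and only the public methods defined in this file are used):
//
//   stateGpu.reinit(numAtomsLocal, numAtomsAll);
//   stateGpu.copyCoordinatesToGpu(h_x, AtomLocality::Local);
//   // ... launch GPU tasks that consume the coordinates ...
//   stateGpu.copyCoordinatesFromGpu(h_x, AtomLocality::Local);
//   stateGpu.waitCoordinatesReadyOnHost(AtomLocality::Local);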


void StatePropagatorDataGpu::reinit(int numAtomsLocal, int numAtomsAll)
{
    return impl_->reinit(numAtomsLocal, numAtomsAll);
}

std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
{
    return impl_->getAtomRangesFromAtomLocality(atomLocality);
}


DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
{
    return impl_->getCoordinates();
}

void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
                                                  AtomLocality                         atomLocality)
{
    return impl_->copyCoordinatesToGpu(h_x, atomLocality);
}

GpuEventSynchronizer*
StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
                                                         const SimulationWorkload& simulationWork,
                                                         const StepWorkload&       stepWork)
{
    return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
}

void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
{
    return impl_->waitCoordinatesCopiedToDevice(atomLocality);
}

GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice()
{
    return impl_->xUpdatedOnDevice();
}

void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec> h_x, AtomLocality atomLocality)
{
    return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
}

void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
{
    return impl_->waitCoordinatesReadyOnHost(atomLocality);
}


DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
{
    return impl_->getVelocities();
}

void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
                                                 AtomLocality                         atomLocality)
{
    return impl_->copyVelocitiesToGpu(h_v, atomLocality);
}

GpuEventSynchronizer* StatePropagatorDataGpu::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
{
    return impl_->getVelocitiesReadyOnDeviceEvent(atomLocality);
}

void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<RVec> h_v, AtomLocality atomLocality)
{
    return impl_->copyVelocitiesFromGpu(h_v, atomLocality);
}

void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
{
    return impl_->waitVelocitiesReadyOnHost(atomLocality);
}


DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
{
    return impl_->getForces();
}

void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality)
{
    return impl_->copyForcesToGpu(h_f, atomLocality);
}

GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
                                                                          bool useGpuFBufferOps)
{
    return impl_->getForcesReadyOnDeviceEvent(atomLocality, useGpuFBufferOps);
}

GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
{
    return impl_->fReducedOnDevice();
}

void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec> h_f, AtomLocality atomLocality)
{
    return impl_->copyForcesFromGpu(h_f, atomLocality);
}

void StatePropagatorDataGpu::waitForcesReadyOnHost(AtomLocality atomLocality)
{
    return impl_->waitForcesReadyOnHost(atomLocality);
}


const DeviceStream* StatePropagatorDataGpu::getUpdateStream()
{
    return impl_->getUpdateStream();
}

int StatePropagatorDataGpu::numAtomsLocal()
{
    return impl_->numAtomsLocal();
}

int StatePropagatorDataGpu::numAtomsAll()
{
    return impl_->numAtomsAll();
}

} // namespace gmx

#endif // GMX_GPU != GMX_GPU_NONE