From: Artem Zhmurov Date: Wed, 24 Feb 2021 21:44:28 +0000 (+0300) Subject: Add device-to-device copy function wrapper with tests in CUDA X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=6d6deb9ee4b88d01be90eebd30cff60a6ed69412;p=alexxy%2Fgromacs.git Add device-to-device copy function wrapper with tests in CUDA The D2D copy is currently only used in CUDA, so only stubs are added for OpenCL and SYCL. Closes #3321 Refs #3318 --- diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh index a28585bdf5..0096450d64 100644 --- a/src/gromacs/gpu_utils/devicebuffer.cuh +++ b/src/gromacs/gpu_utils/devicebuffer.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -210,6 +210,59 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, } } +/*! \brief + * Performs the device-to-device data copy, synchronous or asynchronously on request. + * + * \tparam ValueType Raw value type of the \p buffer. + * \param[in,out] destinationDeviceBuffer Device-side buffer to copy to + * \param[in] sourceDeviceBuffer Device-side buffer to copy from + * \param[in] numValues Number of values to copy. + * \param[in] deviceStream GPU stream to perform asynchronous copy in. + * \param[in] transferKind Copy type: synchronous or asynchronous. + * \param[out] timingEvent A dummy pointer to the D2D copy timing event to be filled + * in. Not used in CUDA implementation. + */ +template +void copyBetweenDeviceBuffers(DeviceBuffer* destinationDeviceBuffer, + DeviceBuffer* sourceDeviceBuffer, + size_t numValues, + const DeviceStream& deviceStream, + GpuApiCallBehavior transferKind, + CommandEvent* /*timingEvent*/) +{ + if (numValues == 0) + { + return; + } + GMX_ASSERT(destinationDeviceBuffer, "needs a destination buffer pointer"); + GMX_ASSERT(sourceDeviceBuffer, "needs a source buffer pointer"); + + cudaError_t stat; + const size_t bytes = numValues * sizeof(ValueType); + switch (transferKind) + { + case GpuApiCallBehavior::Async: + stat = cudaMemcpyAsync(*destinationDeviceBuffer, + *sourceDeviceBuffer, + bytes, + cudaMemcpyDeviceToDevice, + deviceStream.stream()); + GMX_RELEASE_ASSERT( + stat == cudaSuccess, + ("Asynchronous D2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str()); + break; + + case GpuApiCallBehavior::Sync: + stat = cudaMemcpy(*destinationDeviceBuffer, *sourceDeviceBuffer, bytes, cudaMemcpyDeviceToDevice); + GMX_RELEASE_ASSERT( + stat == cudaSuccess, + ("Synchronous D2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str()); + break; + + default: throw; + } +} + /*! \brief * Clears the device buffer asynchronously. * diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h index ce6600afe3..fe489926c8 100644 --- a/src/gromacs/gpu_utils/devicebuffer_ocl.h +++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -227,6 +227,23 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, } } +/*! \brief + * Performs the device-to-device data copy, synchronous or asynchronously on request. + * + * \tparam ValueType Raw value type of the \p buffer. + */ +template +void copyBetweenDeviceBuffers(DeviceBuffer* /* destinationDeviceBuffer */, + DeviceBuffer* /* sourceDeviceBuffer */, + size_t /* numValues */, + const DeviceStream& /* deviceStream */, + GpuApiCallBehavior /* transferKind */, + CommandEvent* /*timingEvent*/) +{ + // OpenCL-TODO + gmx_fatal(FARGS, "D2D copy stub was called. Not yet implemented in OpenCL."); +} + /*! \brief * Clears the device buffer asynchronously. * diff --git a/src/gromacs/gpu_utils/devicebuffer_sycl.h b/src/gromacs/gpu_utils/devicebuffer_sycl.h index 222f08c20b..ff83c6d27c 100644 --- a/src/gromacs/gpu_utils/devicebuffer_sycl.h +++ b/src/gromacs/gpu_utils/devicebuffer_sycl.h @@ -55,6 +55,7 @@ #include "gromacs/gpu_utils/gmxsycl.h" #include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior #include "gromacs/gpu_utils/gputraits_sycl.h" +#include "gromacs/utility/fatalerror.h" #include "gromacs/utility/gmxassert.h" #include "gromacs/utility/stringutil.h" @@ -355,6 +356,23 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, } } +/*! \brief + * Performs the device-to-device data copy, synchronous or asynchronously on request. + * + * \tparam ValueType Raw value type of the \p buffer. + */ +template +void copyBetweenDeviceBuffers(DeviceBuffer* /* destinationDeviceBuffer */, + DeviceBuffer* /* sourceDeviceBuffer */, + size_t /* numValues */, + const DeviceStream& /* deviceStream */, + GpuApiCallBehavior /* transferKind */, + CommandEvent* /*timingEvent*/) +{ + // SYCL-TODO + gmx_fatal(FARGS, "D2D copy stub was called. Not yet implemented in SYCL."); +} + namespace gmx::internal { diff --git a/src/gromacs/gpu_utils/tests/device_buffer.cpp b/src/gromacs/gpu_utils/tests/device_buffer.cpp index 56bbd91afc..8cddef033d 100644 --- a/src/gromacs/gpu_utils/tests/device_buffer.cpp +++ b/src/gromacs/gpu_utils/tests/device_buffer.cpp @@ -237,6 +237,64 @@ TYPED_TEST(DeviceBufferTest, CanCopyToAndFromDeviceWithOffset) } } +# if GMX_GPU_CUDA + +TYPED_TEST(DeviceBufferTest, CanCopyBetweenDeviceBuffers) +{ + for (auto transferKind : { GpuApiCallBehavior::Sync, GpuApiCallBehavior::Async }) + { + PinningPolicy pinningPolicy = (transferKind == GpuApiCallBehavior::Async) + ? PinningPolicy::PinnedIfSupported + : PinningPolicy::CannotBePinned; + for (const auto& testDeviceIn : getTestHardwareEnvironment()->getTestDeviceList()) + { + for (const auto& testDeviceOut : getTestHardwareEnvironment()->getTestDeviceList()) + { + int numValues = 321; + HostVector valuesIn(numValues, { pinningPolicy }); + HostVector valuesOut(numValues, { pinningPolicy }); + + std::iota(valuesIn.begin(), valuesIn.end(), c_initialValue); + + const DeviceContext& deviceContextIn = testDeviceIn->deviceContext(); + const DeviceStream& deviceStreamIn = testDeviceIn->deviceStream(); + setActiveDevice(testDeviceIn->deviceInfo()); + DeviceBuffer bufferIn; + allocateDeviceBuffer(&bufferIn, numValues, deviceContextIn); + + const DeviceContext& deviceContextOut = testDeviceOut->deviceContext(); + const DeviceStream& deviceStreamOut = testDeviceOut->deviceStream(); + setActiveDevice(testDeviceOut->deviceInfo()); + DeviceBuffer bufferOut; + allocateDeviceBuffer(&bufferOut, numValues, deviceContextOut); + + copyToDeviceBuffer( + &bufferIn, valuesIn.data(), 0, numValues, deviceStreamIn, transferKind, nullptr); + copyBetweenDeviceBuffers( + &bufferOut, &bufferIn, numValues, deviceStreamIn, transferKind, nullptr); + if (transferKind == GpuApiCallBehavior::Async) + { + deviceStreamIn.synchronize(); + } + copyFromDeviceBuffer( + valuesOut.data(), &bufferOut, 0, numValues, deviceStreamOut, transferKind, nullptr); + if (transferKind == GpuApiCallBehavior::Async) + { + deviceStreamOut.synchronize(); + } + EXPECT_THAT(valuesOut, Pointwise(Eq(), valuesIn)) + << "Changed after H2D, D2D and D2H " << enumValueToString(transferKind) + << " copy."; + freeDeviceBuffer(&bufferIn); + freeDeviceBuffer(&bufferOut); + } + } + } +} + +# endif // GMX_GPU_CUDA + + } // namespace } // namespace test } // namespace gmx