From: Artem Zhmurov <zhmurov@gmail.com>
Date: Wed, 24 Feb 2021 21:44:28 +0000 (+0300)
Subject: Add device-to-device copy function wrapper with tests in CUDA
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=6d6deb9ee4b88d01be90eebd30cff60a6ed69412;p=alexxy%2Fgromacs.git

Add device-to-device copy function wrapper with tests in CUDA

The D2D copy is currently only used in CUDA, so only stubs are added for OpenCL and SYCL.

Closes #3321

Refs #3318
---

diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh
index a28585bdf5..0096450d64 100644
--- a/src/gromacs/gpu_utils/devicebuffer.cuh
+++ b/src/gromacs/gpu_utils/devicebuffer.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -210,6 +210,59 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
     }
 }
 
+/*! \brief
+ * Performs the device-to-device data copy, synchronous or asynchronously on request.
+ *
+ * \tparam        ValueType                Raw value type of the \p buffer.
+ * \param[in,out] destinationDeviceBuffer  Device-side buffer to copy to
+ * \param[in]     sourceDeviceBuffer       Device-side buffer to copy from
+ * \param[in]     numValues                Number of values to copy.
+ * \param[in]     deviceStream             GPU stream to perform asynchronous copy in.
+ * \param[in]     transferKind             Copy type: synchronous or asynchronous.
+ * \param[out]    timingEvent              A dummy pointer to the D2D copy timing event to be filled
+ * in. Not used in CUDA implementation.
+ */
+template<typename ValueType>
+void copyBetweenDeviceBuffers(DeviceBuffer<ValueType>* destinationDeviceBuffer,
+                              DeviceBuffer<ValueType>* sourceDeviceBuffer,
+                              size_t                   numValues,
+                              const DeviceStream&      deviceStream,
+                              GpuApiCallBehavior       transferKind,
+                              CommandEvent* /*timingEvent*/)
+{
+    if (numValues == 0)
+    {
+        return;
+    }
+    GMX_ASSERT(destinationDeviceBuffer, "needs a destination buffer pointer");
+    GMX_ASSERT(sourceDeviceBuffer, "needs a source buffer pointer");
+
+    cudaError_t  stat;
+    const size_t bytes = numValues * sizeof(ValueType);
+    switch (transferKind)
+    {
+        case GpuApiCallBehavior::Async:
+            stat = cudaMemcpyAsync(*destinationDeviceBuffer,
+                                   *sourceDeviceBuffer,
+                                   bytes,
+                                   cudaMemcpyDeviceToDevice,
+                                   deviceStream.stream());
+            GMX_RELEASE_ASSERT(
+                    stat == cudaSuccess,
+                    ("Asynchronous D2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            stat = cudaMemcpy(*destinationDeviceBuffer, *sourceDeviceBuffer, bytes, cudaMemcpyDeviceToDevice);
+            GMX_RELEASE_ASSERT(
+                    stat == cudaSuccess,
+                    ("Synchronous D2D copy failed. " + gmx::getDeviceErrorString(stat)).c_str());
+            break;
+
+        default: throw;
+    }
+}
+
 /*! \brief
  * Clears the device buffer asynchronously.
  *
diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h
index ce6600afe3..fe489926c8 100644
--- a/src/gromacs/gpu_utils/devicebuffer_ocl.h
+++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -227,6 +227,23 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
     }
 }
 
+/*! \brief
+ * Performs the device-to-device data copy, synchronous or asynchronously on request.
+ *
+ * \tparam        ValueType                Raw value type of the \p buffer.
+ */
+template<typename ValueType>
+void copyBetweenDeviceBuffers(DeviceBuffer<ValueType>* /* destinationDeviceBuffer */,
+                              DeviceBuffer<ValueType>* /* sourceDeviceBuffer */,
+                              size_t /* numValues */,
+                              const DeviceStream& /* deviceStream */,
+                              GpuApiCallBehavior /* transferKind */,
+                              CommandEvent* /*timingEvent*/)
+{
+    // OpenCL-TODO
+    gmx_fatal(FARGS, "D2D copy stub was called. Not yet implemented in OpenCL.");
+}
+
 /*! \brief
  * Clears the device buffer asynchronously.
  *
diff --git a/src/gromacs/gpu_utils/devicebuffer_sycl.h b/src/gromacs/gpu_utils/devicebuffer_sycl.h
index 222f08c20b..ff83c6d27c 100644
--- a/src/gromacs/gpu_utils/devicebuffer_sycl.h
+++ b/src/gromacs/gpu_utils/devicebuffer_sycl.h
@@ -55,6 +55,7 @@
 #include "gromacs/gpu_utils/gmxsycl.h"
 #include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
 #include "gromacs/gpu_utils/gputraits_sycl.h"
+#include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/stringutil.h"
 
@@ -355,6 +356,23 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
     }
 }
 
+/*! \brief
+ * Performs the device-to-device data copy, synchronous or asynchronously on request.
+ *
+ * \tparam        ValueType                Raw value type of the \p buffer.
+ */
+template<typename ValueType>
+void copyBetweenDeviceBuffers(DeviceBuffer<ValueType>* /* destinationDeviceBuffer */,
+                              DeviceBuffer<ValueType>* /* sourceDeviceBuffer */,
+                              size_t /* numValues */,
+                              const DeviceStream& /* deviceStream */,
+                              GpuApiCallBehavior /* transferKind */,
+                              CommandEvent* /*timingEvent*/)
+{
+    // SYCL-TODO
+    gmx_fatal(FARGS, "D2D copy stub was called. Not yet implemented in SYCL.");
+}
+
 
 namespace gmx::internal
 {
diff --git a/src/gromacs/gpu_utils/tests/device_buffer.cpp b/src/gromacs/gpu_utils/tests/device_buffer.cpp
index 56bbd91afc..8cddef033d 100644
--- a/src/gromacs/gpu_utils/tests/device_buffer.cpp
+++ b/src/gromacs/gpu_utils/tests/device_buffer.cpp
@@ -237,6 +237,64 @@ TYPED_TEST(DeviceBufferTest, CanCopyToAndFromDeviceWithOffset)
     }
 }
 
+#    if GMX_GPU_CUDA
+
+TYPED_TEST(DeviceBufferTest, CanCopyBetweenDeviceBuffers)
+{
+    for (auto transferKind : { GpuApiCallBehavior::Sync, GpuApiCallBehavior::Async })
+    {
+        PinningPolicy pinningPolicy = (transferKind == GpuApiCallBehavior::Async)
+                                              ? PinningPolicy::PinnedIfSupported
+                                              : PinningPolicy::CannotBePinned;
+        for (const auto& testDeviceIn : getTestHardwareEnvironment()->getTestDeviceList())
+        {
+            for (const auto& testDeviceOut : getTestHardwareEnvironment()->getTestDeviceList())
+            {
+                int                   numValues = 321;
+                HostVector<TypeParam> valuesIn(numValues, { pinningPolicy });
+                HostVector<TypeParam> valuesOut(numValues, { pinningPolicy });
+
+                std::iota(valuesIn.begin(), valuesIn.end(), c_initialValue<TypeParam>);
+
+                const DeviceContext& deviceContextIn = testDeviceIn->deviceContext();
+                const DeviceStream&  deviceStreamIn  = testDeviceIn->deviceStream();
+                setActiveDevice(testDeviceIn->deviceInfo());
+                DeviceBuffer<TypeParam> bufferIn;
+                allocateDeviceBuffer(&bufferIn, numValues, deviceContextIn);
+
+                const DeviceContext& deviceContextOut = testDeviceOut->deviceContext();
+                const DeviceStream&  deviceStreamOut  = testDeviceOut->deviceStream();
+                setActiveDevice(testDeviceOut->deviceInfo());
+                DeviceBuffer<TypeParam> bufferOut;
+                allocateDeviceBuffer(&bufferOut, numValues, deviceContextOut);
+
+                copyToDeviceBuffer(
+                        &bufferIn, valuesIn.data(), 0, numValues, deviceStreamIn, transferKind, nullptr);
+                copyBetweenDeviceBuffers(
+                        &bufferOut, &bufferIn, numValues, deviceStreamIn, transferKind, nullptr);
+                if (transferKind == GpuApiCallBehavior::Async)
+                {
+                    deviceStreamIn.synchronize();
+                }
+                copyFromDeviceBuffer(
+                        valuesOut.data(), &bufferOut, 0, numValues, deviceStreamOut, transferKind, nullptr);
+                if (transferKind == GpuApiCallBehavior::Async)
+                {
+                    deviceStreamOut.synchronize();
+                }
+                EXPECT_THAT(valuesOut, Pointwise(Eq(), valuesIn))
+                        << "Changed after H2D, D2D and D2H " << enumValueToString(transferKind)
+                        << " copy.";
+                freeDeviceBuffer(&bufferIn);
+                freeDeviceBuffer(&bufferOut);
+            }
+        }
+    }
+}
+
+#    endif // GMX_GPU_CUDA
+
+
 } // namespace
 } // namespace test
 } // namespace gmx