Eliminate D2D copy in update constraints

author Artem Zhmurov <zhmurov@gmail.com>

Fri, 20 Sep 2019 08:34:25 +0000 (10:34 +0200)

committer Szilárd Páll <pall.szilard@gmail.com>

Wed, 2 Oct 2019 17:07:21 +0000 (19:07 +0200)
author Artem Zhmurov <zhmurov@gmail.com>
Fri, 20 Sep 2019 08:34:25 +0000 (10:34 +0200)
committer Szilárd Páll <pall.szilard@gmail.com>
Wed, 2 Oct 2019 17:07:21 +0000 (19:07 +0200)
diff --git a/src/gromacs/mdlib/leapfrog_cuda.cu b/src/gromacs/mdlib/leapfrog_cuda.cu

index e2e9086e80f0f5b1d4fa44d0cc6971baedd5991f..d99ed0cec4bfbfa6a1b853dea01eb84bde7fd5b7 100644 (file)
--- a/src/gromacs/mdlib/leapfrog_cuda.cu
+++ b/src/gromacs/mdlib/leapfrog_cuda.cu
@@ -68,7 +68,10 @@
  namespace gmx
  {
  
-//! Number of CUDA threads in a block
+/*!\brief Number of CUDA threads in a block
+ *
+ * \todo Check if using smaller block size will lead to better prformance.
+ */
  constexpr static int c_threadsPerBlock = 256;
  //! Maximum number of threads in a block (for __launch_bounds__)
  constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
@@ -98,6 +101,9 @@ enum class VelocityScalingType
  };
  
  /*! \brief Main kernel for Leap-Frog integrator.
+ *
+ *  The coordinates and velocities are updated on the GPU. Also saves the intermediate values of the coordinates for
+ *   further use in constraints.
   *
   *  Each GPU thread works with a single particle. Empty declaration is needed to
   *  avoid "no previous prototype for function" clang warning.
@@ -108,8 +114,8 @@ enum class VelocityScalingType
   * \tparam        numTempScaleValues             The number of different T-couple values.
   * \tparam        velocityScaling                Type of the Parrinello-Rahman velocity rescaling.
   * \param[in]     numAtoms                       Total number of atoms.
- * \param[in]     gm_x                           Coordinates before the timestep
- * \param[out]    gm_xp                          Coordinates after the timestep.
+ * \param[in,out] gm_x                           Coordinates to update upon integration.
+ * \param[out]    gm_xp                          A copy of the coordinates before the integration (for constraints).
   * \param[in,out] gm_v                           Velocities to update.
   * \param[in]     gm_f                           Atomic forces.
   * \param[in]     gm_inverseMasses               Reciprocal masses.
@@ -122,7 +128,7 @@ enum class VelocityScalingType
  template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
  __launch_bounds__(c_maxThreadsPerBlock)
  __global__ void leapfrog_kernel(const int                             numAtoms,
-                                const float3* __restrict__            gm_x,
+                                float3* __restrict__                  gm_x,
                                  float3* __restrict__                  gm_xp,
                                  float3* __restrict__                  gm_v,
                                  const float3* __restrict__            gm_f,
@@ -135,7 +141,7 @@ __global__ void leapfrog_kernel(const int                             numAtoms,
  template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
  __launch_bounds__(c_maxThreadsPerBlock)
  __global__ void leapfrog_kernel(const int                             numAtoms,
-                                const float3* __restrict__            gm_x,
+                                float3* __restrict__                  gm_x,
                                  float3* __restrict__                  gm_xp,
                                  float3* __restrict__                  gm_v,
                                  const float3* __restrict__            gm_f,
@@ -154,6 +160,11 @@ __global__ void leapfrog_kernel(const int                             numAtoms,
          float  im          = gm_inverseMasses[threadIndex];
          float  imdt        = im*dt;
  
+        // Swapping places for xp and x so that the x will contain the updated coordinates and xp - the
+        // coordinates before update. This should be taken into account when (if) constraints are applied
+        // after the update: x and xp have to be passed to constraints in the 'wrong' order.
+        gm_xp[threadIndex]  = x;
+
          if (numTempScaleValues != NumTempScaleValues::None || velocityScaling != VelocityScalingType::None)
          {
              float3 vp = v;
@@ -187,7 +198,8 @@ __global__ void leapfrog_kernel(const int                             numAtoms,
  
          x                  += v*dt;
          gm_v[threadIndex]   = v;
-        gm_xp[threadIndex]  = x;
+        gm_x[threadIndex]   = x;
+
      }
      return;
  }
diff --git a/src/gromacs/mdlib/leapfrog_cuda.cuh b/src/gromacs/mdlib/leapfrog_cuda.cuh

index b5b5565500e62adb10bdae67723e868aba64c810..919fc1c9fec1c7164a229dd841c2c9494e3135f0 100644 (file)
--- a/src/gromacs/mdlib/leapfrog_cuda.cuh
+++ b/src/gromacs/mdlib/leapfrog_cuda.cuh
@@ -82,10 +82,10 @@ class LeapFrogCuda
          /*! \brief Integrate
           *
           * Integrates the equation of motion using Leap-Frog algorithm.
-         * Updates coordinates and velocities on the GPU.
+         * Updates coordinates and velocities on the GPU. The current coordinates are saved for constraints.
           *
-         * \param[in]     d_x                    Initial coordinates.
-         * \param[out]    d_xp                   Place to save the resulting coordinates to.
+         * \param[in,out] d_x                    Coordinates to update
+         * \param[out]    d_xp                   Place to save the values of initial coordinates coordinates to.
           * \param[in,out] d_v                    Velocities (will be updated).
           * \param[in]     d_f                    Forces.
           * \param[in]     dt                     Timestep.
diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu

index 30785f9a97320d026105b3f40d78a771bc77483b..de3537b501eb49f153ff030e376056ec5753647a 100644 (file)
--- a/src/gromacs/mdlib/tests/leapfrogtestrunners.cu
+++ b/src/gromacs/mdlib/tests/leapfrogtestrunners.cu
@@ -102,12 +102,9 @@ void integrateLeapFrogGpu(LeapFrogTestData *testData,
          integrator->integrate(d_x, d_xp, d_v, d_f, testData->timestep_,
                                doTempCouple, testData->kineticEnergyData_.tcstat,
                                doPressureCouple, testData->dtPressureCouple_, testData->velocityScalingMatrix_);
-
-        copyFromDeviceBuffer(h_xp, &d_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
-        copyToDeviceBuffer(&d_x,    h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
      }
  
-    copyFromDeviceBuffer(h_xp, &d_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+    copyFromDeviceBuffer(h_xp, &d_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
      copyFromDeviceBuffer(h_v,  &d_v,  0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
  
      freeDeviceBuffer(&d_x);
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cu b/src/gromacs/mdlib/update_constrain_cuda_impl.cu

index 9ba96267380acc8efc92e24ce88857a6bf5f9871..a310ae00eda68886013de83572a12f1c50dfc06d 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cu
@@ -85,13 +85,18 @@ void UpdateConstrainCuda::Impl::integrate(const real                        dt,
      // TODO There is no point in having separate virial matrix for constraints
      clear_mat(virial);
  
+    // The integrate should save a copy of the current coordinates in d_xp_ and write updated once into d_x_.
+    // The d_xp_ is only needed by constraints.
      integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt,
                             doTempCouple, tcstat,
                             doPressureCouple, dtPressureCouple, velocityScalingMatrix);
-    lincsCuda_->apply(d_x_, d_xp_,
+    // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
+    // are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the
+    // d_xp_ -> d_x_ copy after constraints. Note that the integrate saves them in the wrong order as well.
+    lincsCuda_->apply(d_xp_, d_x_,
                        updateVelocities, d_v_, 1.0/dt,
                        computeVirial, virial);
-    settleCuda_->apply(d_x_, d_xp_,
+    settleCuda_->apply(d_xp_, d_x_,
                         updateVelocities, d_v_, 1.0/dt,
                         computeVirial, virial);
  
@@ -105,9 +110,6 @@ void UpdateConstrainCuda::Impl::integrate(const real                        dt,
          }
      }
  
-    // TODO: This should be eliminated
-    cudaMemcpy(d_x_, d_xp_, numAtoms_*sizeof(float3), cudaMemcpyDeviceToDevice);
-
      return;
  }
author	Artem Zhmurov <zhmurov@gmail.com>
	Fri, 20 Sep 2019 08:34:25 +0000 (10:34 +0200)
committer	Szilárd Páll <pall.szilard@gmail.com>
	Wed, 2 Oct 2019 17:07:21 +0000 (19:07 +0200)
src/gromacs/mdlib/leapfrog_cuda.cu		patch \| blob \| history
src/gromacs/mdlib/leapfrog_cuda.cuh		patch \| blob \| history
src/gromacs/mdlib/tests/leapfrogtestrunners.cu		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda_impl.cu		patch \| blob \| history