Add method for coordinates scaling into GPU update-constrain class
authorArtem Zhmurov <zhmurov@gmail.com>
Tue, 12 Nov 2019 09:55:02 +0000 (10:55 +0100)
committerMark Abraham <mark.j.abraham@gmail.com>
Wed, 27 Nov 2019 12:20:51 +0000 (13:20 +0100)
The coordinates scaling is needed on the pressure coupling step in
case the box size has changed.

Change-Id: I0bf7c7a2f77c1bd42dd0713f689096eaca409f88

src/gromacs/mdlib/update_constrain_cuda.h
src/gromacs/mdlib/update_constrain_cuda_impl.cpp
src/gromacs/mdlib/update_constrain_cuda_impl.cu
src/gromacs/mdlib/update_constrain_cuda_impl.h

index 90fb3679b3e1e446d737e26ab16c3fae0ad41330..40804bfec1b22b5f0b2fbbef97fe09fc0bf9bc35 100644 (file)
@@ -116,6 +116,15 @@ public:
                    float                             dtPressureCouple,
                    const matrix                      velocityScalingMatrix);
 
+    /*! \brief Scale coordinates on the GPU for the pressure coupling.
+     *
+     * After pressure coupling step, the box size may change. Hence, the coordinates should be
+     * scaled so that all the particles fit in the new box.
+     *
+     * \param[in] scalingMatrix Coordinates scaling matrix.
+     */
+    void scaleCoordinates(const matrix scalingMatrix);
+
     /*! \brief Set the pointers and update data-structures (e.g. after NB search step).
      *
      * \param[in,out]  d_x                 Device buffer with coordinates.
index 3f04316f2e806c9683520938b08852c9ad85566f..e95a33e3e250fd123e2b40e168558b5760219d15 100644 (file)
@@ -82,6 +82,12 @@ void UpdateConstrainCuda::integrate(gmx_unused GpuEventSynchronizer* fReadyOnDev
                "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
 }
 
+void UpdateConstrainCuda::scaleCoordinates(gmx_unused const matrix scalingMatrix)
+{
+    GMX_ASSERT(false,
+               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+}
+
 void UpdateConstrainCuda::set(gmx_unused DeviceBuffer<float> d_x,
                               gmx_unused DeviceBuffer<float> d_v,
                               gmx_unused const DeviceBuffer<float> d_f,
index b6ebffb24fae953ba714d9d3fbe07c1a3b7d09fa..b4930e07ef9d242d9ef9857d444b6cc70814db26 100644 (file)
 
 namespace gmx
 {
+/*!\brief Number of CUDA threads in a block
+ *
+ * \todo Check if using smaller block size will lead to better prformance.
+ */
+constexpr static int c_threadsPerBlock = 256;
+//! Maximum number of threads in a block (for __launch_bounds__)
+constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
+
+/*! \brief Scaling matrix struct.
+ *
+ * \todo Should be generalized.
+ */
+struct ScalingMatrix
+{
+    float xx, yy, zz, yx, zx, zy;
+};
+
+__launch_bounds__(c_maxThreadsPerBlock) __global__
+        void scaleCoordinates_kernel(const int numAtoms, float3* __restrict__ gm_x, const ScalingMatrix scalingMatrix)
+{
+    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+    if (threadIndex < numAtoms)
+    {
+        float3 x = gm_x[threadIndex];
+
+        x.x = scalingMatrix.xx * x.x + scalingMatrix.yx * x.y + scalingMatrix.zx * x.z;
+        x.y = scalingMatrix.yy * x.y + scalingMatrix.zy * x.z;
+        x.z = scalingMatrix.zz * x.z;
+
+        gm_x[threadIndex] = x;
+    }
+}
 
 void UpdateConstrainCuda::Impl::integrate(GpuEventSynchronizer*             fReadyOnDevice,
                                           const real                        dt,
@@ -111,6 +143,25 @@ void UpdateConstrainCuda::Impl::integrate(GpuEventSynchronizer*             fRea
     return;
 }
 
+void UpdateConstrainCuda::Impl::scaleCoordinates(const matrix scalingMatrix)
+{
+    ScalingMatrix mu;
+    mu.xx = scalingMatrix[XX][XX];
+    mu.yy = scalingMatrix[YY][YY];
+    mu.zz = scalingMatrix[ZZ][ZZ];
+    mu.yx = scalingMatrix[YY][XX];
+    mu.zx = scalingMatrix[ZZ][XX];
+    mu.zy = scalingMatrix[ZZ][YY];
+
+    const auto kernelArgs = prepareGpuKernelArguments(
+            scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_x_, &mu);
+    launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, nullptr,
+                    "scaleCoordinates_kernel", kernelArgs);
+    // TODO: Although this only happens on the pressure coupling steps, this synchronization
+    //       can affect the perfornamce if nstpcouple is small.
+    gpuStreamSynchronize(commandStream_);
+}
+
 UpdateConstrainCuda::Impl::Impl(const t_inputrec&     ir,
                                 const gmx_mtop_t&     mtop,
                                 const void*           commandStream,
@@ -125,6 +176,12 @@ UpdateConstrainCuda::Impl::Impl(const t_inputrec&     ir,
     integrator_ = std::make_unique<LeapFrogCuda>(commandStream_);
     lincsCuda_  = std::make_unique<LincsCuda>(ir.nLincsIter, ir.nProjOrder, commandStream_);
     settleCuda_ = std::make_unique<SettleCuda>(mtop, commandStream_);
+
+    coordinateScalingKernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
+    coordinateScalingKernelLaunchConfig_.blockSize[1]     = 1;
+    coordinateScalingKernelLaunchConfig_.blockSize[2]     = 1;
+    coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
+    coordinateScalingKernelLaunchConfig_.stream           = commandStream_;
 }
 
 UpdateConstrainCuda::Impl::~Impl() {}
@@ -155,6 +212,9 @@ void UpdateConstrainCuda::Impl::set(DeviceBuffer<float>       d_x,
     integrator_->set(md, numTempScaleValues, md.cTC);
     lincsCuda_->set(idef, md);
     settleCuda_->set(idef, md);
+
+    coordinateScalingKernelLaunchConfig_.gridSize[0] =
+            (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
 }
 
 void UpdateConstrainCuda::Impl::setPbc(const t_pbc* pbc)
@@ -195,6 +255,11 @@ void UpdateConstrainCuda::integrate(GpuEventSynchronizer*             fReadyOnDe
                      tcstat, doPressureCouple, dtPressureCouple, velocityScalingMatrix);
 }
 
+void UpdateConstrainCuda::scaleCoordinates(const matrix scalingMatrix)
+{
+    impl_->scaleCoordinates(scalingMatrix);
+}
+
 void UpdateConstrainCuda::set(DeviceBuffer<float>       d_x,
                               DeviceBuffer<float>       d_v,
                               const DeviceBuffer<float> d_f,
index 88dcfb69a92aa2931855fc9eb278a1e416bf9b1a..08c3c1feaeaedf3c4de8e5f1fc7871cad69e92ed 100644 (file)
@@ -115,6 +115,15 @@ public:
                    float                             dtPressureCouple,
                    const matrix                      velocityScalingMatrix);
 
+    /*! \brief Scale coordinates on the GPU for the pressure coupling.
+     *
+     * After pressure coupling step, the box size may change. Hence, the coordinates should be
+     * scaled so that all the particles fit in the new box.
+     *
+     * \param[in] scalingMatrix Coordinates scaling matrix.
+     */
+    void scaleCoordinates(const matrix scalingMatrix);
+
     /*! \brief Set the pointers and update data-structures (e.g. after NB search step).
      *
      * \param[in,out]  d_x            Device buffer with coordinates.
@@ -147,6 +156,8 @@ public:
 private:
     //! CUDA stream
     CommandStream commandStream_ = nullptr;
+    //! CUDA kernel launch config
+    KernelLaunchConfig coordinateScalingKernelLaunchConfig_;
 
     //! Periodic boundary data
     PbcAiuc pbcAiuc_;