Move PBC setup out of the launchKernel(...) in GPU bondeds
authorArtem Zhmurov <zhmurov@gmail.com>
Mon, 27 Apr 2020 14:24:10 +0000 (14:24 +0000)
committerMark Abraham <mark.j.abraham@gmail.com>
Mon, 27 Apr 2020 14:24:10 +0000 (14:24 +0000)
The PBC setup should not be performed on every step, hence it
should be separated from the force computation launch. This
is a preparation step that does not change the logic: it only
separates the PBC setup and changes the kernel launch signature.

src/gromacs/listed_forces/gpubonded.h
src/gromacs/listed_forces/gpubonded_impl.cpp
src/gromacs/listed_forces/gpubonded_impl.cu
src/gromacs/listed_forces/gpubonded_impl.h
src/gromacs/listed_forces/gpubondedkernels.cu
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/runner.cpp

index e2c114ce4bf65b98fbde7e702c36f9777127fc8b..9ba6d1a7ae40e37042b244c0165d20d67a2c8858 100644 (file)
@@ -51,6 +51,7 @@
 
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/math/vectypes.h"
+#include "gromacs/pbcutil/pbc.h"
 #include "gromacs/topology/idef.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/classhelpers.h"
@@ -61,7 +62,6 @@ class DeviceStream;
 struct gmx_enerdata_t;
 struct gmx_ffparams_t;
 struct gmx_mtop_t;
-struct t_forcerec;
 struct t_inputrec;
 struct gmx_wallcycle;
 
@@ -108,8 +108,18 @@ bool inputSupportsGpuBondeds(const t_inputrec& ir, const gmx_mtop_t& mtop, std::
 class GpuBonded
 {
 public:
-    //! Construct the manager with constant data and the stream to use.
+    /*! \brief Construct the manager with constant data and the stream to use.
+     *
+     * \param[in] ffparams                   Force-field parameters.
+     * \param[in] electrostaticsScaleFactor  Scaling factor for the electrostatic potential
+     *                                       (Coulomb constant, multiplied by the Fudge factor).
+     * \param[in] deviceContext              GPU device context (not used in CUDA).
+     * \param[in] deviceStream               GPU device stream.
+     * \param[in] wcycle                     The wallclock counter.
+     *
+     */
     GpuBonded(const gmx_ffparams_t& ffparams,
+              const float           electrostaticsScaleFactor,
               const DeviceContext&  deviceContext,
               const DeviceStream&   deviceStream,
               gmx_wallcycle*        wcycle);
@@ -122,23 +132,67 @@ public:
      * Intended to be called after each neighbour search
      * stage. Copies the bonded interactions assigned to the GPU
      * to device data structures, and updates device buffers that
-     * may have been updated after search. */
+     * may have been updated after search.
+     *
+     * \param[in]     nbnxnAtomOrder  Mapping between rvec and NBNXM formats.
+     * \param[in]     idef            List of interactions to compute.
+     * \param[in]     xqDevice        Device buffer with coordinates and charge in xyzq-format.
+     * \param[in,out] forceDevice     Device buffer with forces.
+     * \param[in,out] fshiftDevice    Device buffer with shift forces.
+     */
     void updateInteractionListsAndDeviceBuffers(ArrayRef<const int>           nbnxnAtomOrder,
                                                 const InteractionDefinitions& idef,
                                                 void*                         xqDevice,
                                                 DeviceBuffer<RVec>            forceDevice,
                                                 DeviceBuffer<RVec>            fshiftDevice);
+    /*! \brief
+     * Update PBC data.
+     *
+     * Converts PBC data from t_pbc into the PbcAiuc format and stores the latter.
+     *
+     * \param[in] pbcType The type of the periodic boundary.
+     * \param[in] box     The periodic boundary box matrix.
+     * \param[in] canMoleculeSpanPbc  Whether one molecule can have atoms in different PBC cells.
+     */
+    void setPbc(PbcType pbcType, const matrix box, bool canMoleculeSpanPbc);
 
     /*! \brief Returns whether there are bonded interactions
-     * assigned to the GPU */
+     * assigned to the GPU
+     *
+     * \returns Whether the list of interactions has elements.
+     */
     bool haveInteractions() const;
-    /*! \brief Launches bonded kernel on a GPU */
-    void launchKernel(const t_forcerec* fr, const gmx::StepWorkload& stepWork, const matrix box);
-    /*! \brief Launches the transfer of computed bonded energies. */
+
+    /*! \brief Launches bonded kernel on a GPU
+     *
+     * \param[in]  stepWork  Simulation step work to determine if energy/virial are to be computed on this step.
+     */
+    void launchKernel(const gmx::StepWorkload& stepWork);
+
+    /*! \brief Sets the PBC and launches bonded kernel on a GPU
+     *
+     * \param[in] pbcType The type of the periodic boundary.
+     * \param[in] box     The periodic boundary box matrix.
+     * \param[in] canMoleculeSpanPbc  Whether one molecule can have atoms in different PBC cells.
+     * \param[in] stepWork  Simulation step work to determine if energy/virial are to be computed on this step.
+     */
+    void setPbcAndlaunchKernel(PbcType                  pbcType,
+                               const matrix             box,
+                               bool                     canMoleculeSpanPbc,
+                               const gmx::StepWorkload& stepWork);
+
+    /*! \brief Launches the transfer of computed bonded energies.
+     */
     void launchEnergyTransfer();
-    /*! \brief Waits on the energy transfer, and accumulates bonded energies to \c enerd. */
+
+    /*! \brief Waits on the energy transfer, and accumulates bonded energies to \c enerd.
+     *
+     * \param[in,out] enerd  The energy data object to add energy terms to.
+     */
     void waitAccumulateEnergyTerms(gmx_enerdata_t* enerd);
-    /*! \brief Clears the device side energy buffer */
+
+    /*! \brief Clears the device side energy buffer
+     */
     void clearEnergies();
 
 private:
index 67eec92cacaabad58a9264039f1b603ea5669a99..07244d8dae8c03471d392e646934349baea4435d 100644 (file)
@@ -161,6 +161,7 @@ class GpuBonded::Impl
 };
 
 GpuBonded::GpuBonded(const gmx_ffparams_t& /* ffparams */,
+                     const float /* electrostaticsScaleFactor */,
                      const DeviceContext& /* deviceContext */,
                      const DeviceStream& /* deviceStream */,
                      gmx_wallcycle* /* wcycle */) :
@@ -178,14 +179,21 @@ void GpuBonded::updateInteractionListsAndDeviceBuffers(ArrayRef<const int> /* nb
 {
 }
 
+void GpuBonded::setPbc(PbcType /* pbcType */, const matrix /* box */, bool /* canMoleculeSpanPbc */)
+{
+}
+
 bool GpuBonded::haveInteractions() const
 {
     return !impl_;
 }
 
-void GpuBonded::launchKernel(const t_forcerec* /* fr */,
-                             const gmx::StepWorkload& /* stepWork */,
-                             const matrix /* box */)
+void GpuBonded::launchKernel(const gmx::StepWorkload& /* stepWork */) {}
+
+void GpuBonded::setPbcAndlaunchKernel(PbcType /* pbcType */,
+                                      const matrix /* box */,
+                                      bool /* canMoleculeSpanPbc */,
+                                      const gmx::StepWorkload& /* stepWork */)
 {
 }
 
index faa775a0f7d9cd62529b9af9a92dffc480bcc4a1..741244caaf44f7ad22b1cd27983d61ab84129500 100644 (file)
@@ -62,10 +62,13 @@ struct t_forcerec;
 
 namespace gmx
 {
+// Number of CUDA threads in a block
+constexpr static int c_threadsPerBlock = 256;
 
 // ---- GpuBonded::Impl
 
 GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
+                      const float           electrostaticsScaleFactor,
                       const DeviceContext&  deviceContext,
                       const DeviceStream&   deviceStream,
                       gmx_wallcycle*        wcycle) :
@@ -75,6 +78,10 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
     GMX_RELEASE_ASSERT(deviceStream.isValid(),
                        "Can't run GPU version of bonded forces in stream that is not valid.");
 
+    static_assert(c_threadsPerBlock >= SHIFTS,
+                  "Threads per block in GPU bonded must be >= SHIFTS for the virial kernel "
+                  "(calcVir=true)");
+
     wcycle_ = wcycle;
 
     allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_);
@@ -87,17 +94,27 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
     allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_);
     clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);
 
-    kernelParams_.d_forceParams = d_forceParams_;
-    kernelParams_.d_xq          = d_xq_;
-    kernelParams_.d_f           = d_f_;
-    kernelParams_.d_fShift      = d_fShift_;
-    kernelParams_.d_vTot        = d_vTot_;
+    kernelParams_.electrostaticsScaleFactor = electrostaticsScaleFactor;
+    kernelParams_.d_forceParams             = d_forceParams_;
+    kernelParams_.d_xq                      = d_xq_;
+    kernelParams_.d_f                       = d_f_;
+    kernelParams_.d_fShift                  = d_fShift_;
+    kernelParams_.d_vTot                    = d_vTot_;
     for (int i = 0; i < numFTypesOnGpu; i++)
     {
         kernelParams_.d_iatoms[i]        = nullptr;
         kernelParams_.fTypeRangeStart[i] = 0;
         kernelParams_.fTypeRangeEnd[i]   = -1;
     }
+
+    int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
+
+    kernelLaunchConfig_.blockSize[0] = c_threadsPerBlock;
+    kernelLaunchConfig_.blockSize[1] = 1;
+    kernelLaunchConfig_.blockSize[2] = 1;
+    kernelLaunchConfig_.gridSize[0]  = (fTypeRangeEnd + c_threadsPerBlock) / c_threadsPerBlock;
+    kernelLaunchConfig_.gridSize[1]  = 1;
+    kernelLaunchConfig_.gridSize[2]  = 1;
 }
 
 GpuBonded::Impl::~Impl()
@@ -248,6 +265,9 @@ void GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>
         fTypesCounter++;
     }
 
+    int fTypeRangeEnd               = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
+    kernelLaunchConfig_.gridSize[0] = (fTypeRangeEnd + c_threadsPerBlock) / c_threadsPerBlock;
+
     d_xq_     = static_cast<float4*>(d_xqPtr);
     d_f_      = asFloat3(d_fPtr);
     d_fShift_ = asFloat3(d_fShiftPtr);
@@ -261,6 +281,13 @@ void GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>
     // TODO wallcycle sub stop
 }
 
+void GpuBonded::Impl::setPbc(PbcType pbcType, const matrix box, bool canMoleculeSpanPbc)
+{
+    PbcAiuc pbcAiuc;
+    setPbcAiuc(canMoleculeSpanPbc ? numPbcDimensions(pbcType) : 0, box, &pbcAiuc);
+    kernelParams_.pbcAiuc = pbcAiuc;
+}
+
 bool GpuBonded::Impl::haveInteractions() const
 {
     return haveInteractions_;
@@ -316,10 +343,11 @@ void GpuBonded::Impl::clearEnergies()
 // ---- GpuBonded
 
 GpuBonded::GpuBonded(const gmx_ffparams_t& ffparams,
+                     const float           electrostaticsScaleFactor,
                      const DeviceContext&  deviceContext,
                      const DeviceStream&   deviceStream,
                      gmx_wallcycle*        wcycle) :
-    impl_(new Impl(ffparams, deviceContext, deviceStream, wcycle))
+    impl_(new Impl(ffparams, electrostaticsScaleFactor, deviceContext, deviceStream, wcycle))
 {
 }
 
@@ -334,11 +362,25 @@ void GpuBonded::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>
     impl_->updateInteractionListsAndDeviceBuffers(nbnxnAtomOrder, idef, d_xq, d_f, d_fShift);
 }
 
+void GpuBonded::setPbc(PbcType pbcType, const matrix box, bool canMoleculeSpanPbc)
+{
+    impl_->setPbc(pbcType, box, canMoleculeSpanPbc);
+}
+
 bool GpuBonded::haveInteractions() const
 {
     return impl_->haveInteractions();
 }
 
+void GpuBonded::setPbcAndlaunchKernel(PbcType                  pbcType,
+                                      const matrix             box,
+                                      bool                     canMoleculeSpanPbc,
+                                      const gmx::StepWorkload& stepWork)
+{
+    setPbc(pbcType, box, canMoleculeSpanPbc);
+    launchKernel(stepWork);
+}
+
 void GpuBonded::launchEnergyTransfer()
 {
     impl_->launchEnergyTransfer();
index 32cce2599aae46e2d2ce21f05403d9966cad262d..dacb612308972ada61fe965ec6c1c5190bb621d8 100644 (file)
@@ -81,7 +81,7 @@ struct BondedCudaKernelParameters
     //! Periodic boundary data
     PbcAiuc pbcAiuc;
     //! Scale factor
-    float scaleFactor;
+    float electrostaticsScaleFactor;
     //! The bonded types on GPU
     int fTypesOnGpu[numFTypesOnGpu];
     //! The number of interaction atom (iatom) elements for every function type
@@ -112,12 +112,12 @@ struct BondedCudaKernelParameters
 
         setPbcAiuc(0, boxDummy, &pbcAiuc);
 
-        scaleFactor   = 1.0;
-        d_forceParams = nullptr;
-        d_xq          = nullptr;
-        d_f           = nullptr;
-        d_fShift      = nullptr;
-        d_vTot        = nullptr;
+        electrostaticsScaleFactor = 1.0;
+        d_forceParams             = nullptr;
+        d_xq                      = nullptr;
+        d_f                       = nullptr;
+        d_fShift                  = nullptr;
+        d_vTot                    = nullptr;
     }
 };
 
@@ -127,6 +127,7 @@ class GpuBonded::Impl
 public:
     //! Constructor
     Impl(const gmx_ffparams_t& ffparams,
+         const float           electrostaticsScaleFactor,
          const DeviceContext&  deviceContext,
          const DeviceStream&   deviceStream,
          gmx_wallcycle*        wcycle);
@@ -145,10 +146,20 @@ public:
                                                 void*                         xqDevice,
                                                 DeviceBuffer<RVec>            forceDevice,
                                                 DeviceBuffer<RVec>            fshiftDevice);
+    /*! \brief
+     * Update PBC data.
+     *
+     * Converts PBC data from t_pbc into the PbcAiuc format and stores the latter.
+     *
+     * \param[in] pbcType The type of the periodic boundary.
+     * \param[in] box     The periodic boundary box matrix.
+     * \param[in] canMoleculeSpanPbc  Whether one molecule can have atoms in different PBC cells.
+     */
+    void setPbc(PbcType pbcType, const matrix box, bool canMoleculeSpanPbc);
 
     /*! \brief Launches bonded kernel on a GPU */
     template<bool calcVir, bool calcEner>
-    void launchKernel(const t_forcerec* fr, const matrix box);
+    void launchKernel();
     /*! \brief Returns whether there are bonded interactions
      * assigned to the GPU */
     bool haveInteractions() const;
@@ -191,6 +202,9 @@ private:
     //! Parameters and pointers, passed to the CUDA kernel
     BondedCudaKernelParameters kernelParams_;
 
+    //! GPU kernel launch configuration
+    KernelLaunchConfig kernelLaunchConfig_;
+
     //! \brief Pointer to wallcycle structure.
     gmx_wallcycle* wcycle_;
 };
index 9e2e23bb08fa1bdb237a83eeb03d886e7f7c71b1..d253241ee73b6ef0b54fe3147c3b19f973c88ece 100644 (file)
 #include "gromacs/listed_forces/gpubonded.h"
 #include "gromacs/math/units.h"
 #include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdtypes/forcerec.h"
 #include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/mdtypes/simulation_workload.h"
-#include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
 #include "gromacs/utility/gmxassert.h"
 
@@ -71,9 +69,6 @@
 #    include <limits>
 #endif
 
-// CUDA threads per block
-#define TPB_BONDED 256
-
 /*-------------------------------- CUDA kernels-------------------------------- */
 /*------------------------------------------------------------------------------*/
 
@@ -792,10 +787,10 @@ __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
                                                  kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_LJ14:
-                    pairs_gpu<calcVir, calcEner>(fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
-                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc,
-                                                 kernelParams.pbcAiuc, kernelParams.scaleFactor,
-                                                 &vtotVdw_loc, &vtotElec_loc);
+                    pairs_gpu<calcVir, calcEner>(
+                            fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
+                            kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc,
+                            kernelParams.electrostaticsScaleFactor, &vtotVdw_loc, &vtotElec_loc);
                     break;
             }
             break;
@@ -826,15 +821,10 @@ __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
 
 
 template<bool calcVir, bool calcEner>
-void GpuBonded::Impl::launchKernel(const t_forcerec* fr, const matrix box)
+void GpuBonded::Impl::launchKernel()
 {
     GMX_ASSERT(haveInteractions_,
                "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
-    static_assert(TPB_BONDED >= SHIFTS,
-                  "TPB_BONDED must be >= SHIFTS for the virial kernel (calcVir=true)");
-
-    PbcAiuc pbcAiuc;
-    setPbcAiuc(fr->bMolPBC ? numPbcDimensions(fr->pbcType) : 0, box, &pbcAiuc);
 
     int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
 
@@ -843,38 +833,28 @@ void GpuBonded::Impl::launchKernel(const t_forcerec* fr, const matrix box)
         return;
     }
 
-    KernelLaunchConfig config;
-    config.blockSize[0] = TPB_BONDED;
-    config.blockSize[1] = 1;
-    config.blockSize[2] = 1;
-    config.gridSize[0]  = (fTypeRangeEnd + TPB_BONDED) / TPB_BONDED;
-    config.gridSize[1]  = 1;
-    config.gridSize[2]  = 1;
-
-    auto kernelPtr            = exec_kernel_gpu<calcVir, calcEner>;
-    kernelParams_.scaleFactor = fr->ic->epsfac * fr->fudgeQQ;
-    kernelParams_.pbcAiuc     = pbcAiuc;
+    auto kernelPtr = exec_kernel_gpu<calcVir, calcEner>;
 
-    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &kernelParams_);
+    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig_, &kernelParams_);
 
-    launchGpuKernel(kernelPtr, config, deviceStream_, nullptr, "exec_kernel_gpu<calcVir, calcEner>",
-                    kernelArgs);
+    launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr,
+                    "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
 }
 
-void GpuBonded::launchKernel(const t_forcerec* fr, const gmx::StepWorkload& stepWork, const matrix box)
+void GpuBonded::launchKernel(const gmx::StepWorkload& stepWork)
 {
     if (stepWork.computeEnergy)
     {
         // When we need the energy, we also need the virial
-        impl_->launchKernel<true, true>(fr, box);
+        impl_->launchKernel<true, true>();
     }
     else if (stepWork.computeVirial)
     {
-        impl_->launchKernel<true, false>(fr, box);
+        impl_->launchKernel<true, false>();
     }
     else
     {
-        impl_->launchKernel<false, false>(fr, box);
+        impl_->launchKernel<false, false>();
     }
 }
 
index 054ffdcfd21fb6196640231637c615dabbe1467a..5ff92d9e19401a2ee8f26246cd52f18c96521275 100644 (file)
@@ -1251,7 +1251,7 @@ void do_force(FILE*                               fplog,
         if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
         {
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
-            fr->gpuBonded->launchKernel(fr, stepWork, box);
+            fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
         }
 
@@ -1348,7 +1348,7 @@ void do_force(FILE*                               fplog,
             if (domainWork.haveGpuBondedWork)
             {
                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
-                fr->gpuBonded->launchKernel(fr, stepWork, box);
+                fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
             }
 
index f5df72545de9af2d684a5912e4c815f48ac107a6..daec6ce305bb2a94f3d7faa93b24f9807b7fa494 100644 (file)
@@ -1397,7 +1397,7 @@ int Mdrunner::mdrunner()
                                "GPU device stream manager should be valid in order to use GPU "
                                "version of bonded forces.");
             gpuBonded = std::make_unique<GpuBonded>(
-                    mtop.ffparams, deviceStreamManager->context(),
+                    mtop.ffparams, fr->ic->epsfac * fr->fudgeQQ, deviceStreamManager->context(),
                     deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)), wcycle);
             fr->gpuBonded = gpuBonded.get();
         }