Merge origin/release-2020 into master
authorPaul Bauer <paul.bauer.q@gmail.com>
Sat, 30 Nov 2019 10:58:42 +0000 (11:58 +0100)
committerPaul Bauer <paul.bauer.q@gmail.com>
Sat, 30 Nov 2019 10:58:42 +0000 (11:58 +0100)
Resolved Conflicts:
src/gromacs/mdlib/update.cpp
src/gromacs/mdrun/md.cpp

Change-Id: Ibf1c632cc3096121dccd6c1c1af9bff3d590d18f

1  2 
src/gromacs/mdlib/leapfrog_cuda.cu
src/gromacs/mdlib/leapfrog_cuda.cuh
src/gromacs/mdlib/lincs_cuda.cu
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdlib/update_constrain_cuda_impl.cu
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 18768c28f29fd59e7647c9a19757ebe34833ea9e,4415158d32ea7d051e16d327afa515c1b4b28934..ac4135960740aea0c45f4e8db8deb27856b2db45
@@@ -111,19 -111,19 +111,19 @@@ enum class VelocityScalingTyp
   *  \todo Check if the force should be set to zero here.
   *  \todo This kernel can also accumulate incidental temperatures for each atom.
   *
-  * \tparam        numTempScaleValues             The number of different T-couple values.
-  * \tparam        velocityScaling                Type of the Parrinello-Rahman velocity rescaling.
-  * \param[in]     numAtoms                       Total number of atoms.
-  * \param[in,out] gm_x                           Coordinates to update upon integration.
-  * \param[out]    gm_xp                          A copy of the coordinates before the integration (for constraints).
-  * \param[in,out] gm_v                           Velocities to update.
-  * \param[in]     gm_f                           Atomic forces.
-  * \param[in]     gm_inverseMasses               Reciprocal masses.
-  * \param[in]     dt                             Timestep.
-  * \param[in]     gm_lambdas                     Temperature scaling factors (one per group)
-  * \param[in]     gm_tempScaleGroups             Mapping of atoms into groups.
-  * \param[in]     dtPressureCouple               Time step for pressure coupling
-  * \param[in]     velocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
+  * \tparam        numTempScaleValues               The number of different T-couple values.
+  * \tparam        velocityScaling                  Type of the Parrinello-Rahman velocity rescaling.
+  * \param[in]     numAtoms                         Total number of atoms.
+  * \param[in,out] gm_x                             Coordinates to update upon integration.
+  * \param[out]    gm_xp                            A copy of the coordinates before the integration (for constraints).
+  * \param[in,out] gm_v                             Velocities to update.
+  * \param[in]     gm_f                             Atomic forces.
+  * \param[in]     gm_inverseMasses                 Reciprocal masses.
+  * \param[in]     dt                               Timestep.
+  * \param[in]     gm_lambdas                       Temperature scaling factors (one per group)
+  * \param[in]     gm_tempScaleGroups               Mapping of atoms into groups.
+  * \param[in]     dtPressureCouple                 Time step for pressure coupling
+  * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
   */
  template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
  __launch_bounds__(c_maxThreadsPerBlock) __global__
                               const float dt,
                               const float* __restrict__ gm_lambdas,
                               const unsigned short* __restrict__ gm_tempScaleGroups,
-                              const float3 velocityScalingMatrixDiagonal);
+                              const float3 prVelocityScalingMatrixDiagonal);
  
  template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
  __launch_bounds__(c_maxThreadsPerBlock) __global__
                               const float dt,
                               const float* __restrict__ gm_lambdas,
                               const unsigned short* __restrict__ gm_tempScaleGroups,
-                              const float3 velocityScalingMatrixDiagonal)
+                              const float3 prVelocityScalingMatrixDiagonal)
  {
      int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
      if (threadIndex < numAtoms)
  
              if (velocityScaling == VelocityScalingType::Diagonal)
              {
-                 vp.x -= velocityScalingMatrixDiagonal.x * v.x;
-                 vp.y -= velocityScalingMatrixDiagonal.y * v.y;
-                 vp.z -= velocityScalingMatrixDiagonal.z * v.z;
+                 vp.x -= prVelocityScalingMatrixDiagonal.x * v.x;
+                 vp.y -= prVelocityScalingMatrixDiagonal.y * v.y;
+                 vp.z -= prVelocityScalingMatrixDiagonal.z * v.z;
              }
  
              v = vp;
  
  /*! \brief Select templated kernel.
   *
-  * Returns pointer to a CUDA kernel based on the number of temperature coupling groups.
-  * If zero is passed as an argument, it is assumed that no temperature coupling groups are used.
+  * Returns a pointer to a CUDA kernel based on the number of temperature coupling groups and
+  * whether temperature and/or pressure coupling is enabled.
   *
-  * \param[in]  numTempScaleValues  Numer of temperature coupling groups in the system
-  * \param[in]  velocityScaling     Type of the Parrinello-Rahman velocity scaling
+  * \param[in]  doTemperatureScaling   If the kernel with temperature coupling velocity scaling
+  *                                    should be selected.
+  * \param[in]  numTempScaleValues     Number of temperature coupling groups in the system.
+  * \param[in]  prVelocityScalingType  Type of the Parrinello-Rahman velocity scaling.
   *
  *  \return                         Pointer to CUDA kernel
   */
- inline auto selectLeapFrogKernelPtr(int numTempScaleValues, VelocityScalingType velocityScaling)
+ inline auto selectLeapFrogKernelPtr(bool                doTemperatureScaling,
+                                     int                 numTempScaleValues,
+                                     VelocityScalingType prVelocityScalingType)
  {
+     // Check input for consistency: if there is temperature coupling, at least one coupling group should be defined.
+     GMX_ASSERT(!doTemperatureScaling || (numTempScaleValues > 0),
+                "Temperature coupling was requested with no temperature coupling groups.");
      auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
  
-     if (velocityScaling == VelocityScalingType::None)
+     if (prVelocityScalingType == VelocityScalingType::None)
      {
-         if (numTempScaleValues == 0)
+         if (!doTemperatureScaling)
          {
              kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
          }
          {
              kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::None>;
          }
-         else
-         {
-             GMX_RELEASE_ASSERT(false,
-                                "Number of temperature coupling groups should be greater than zero "
-                                "(zero for no coupling).");
-         }
      }
-     else if (velocityScaling == VelocityScalingType::Diagonal)
+     else if (prVelocityScalingType == VelocityScalingType::Diagonal)
      {
-         if (numTempScaleValues == 0)
+         if (!doTemperatureScaling)
          {
              kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::Diagonal>;
          }
          {
              kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::Diagonal>;
          }
-         else
-         {
-             GMX_RELEASE_ASSERT(false,
-                                "Number of temperature coupling groups should be greater than zero "
-                                "(zero for no coupling).");
-         }
      }
      else
      {
@@@ -272,19 -267,19 +267,19 @@@ void LeapFrogCuda::integrate(const floa
                               float3*                           d_v,
                               const float3*                     d_f,
                               const real                        dt,
-                              const bool                        doTempCouple,
+                              const bool                        doTemperatureScaling,
                               gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                              const bool                        doPressureCouple,
+                              const bool                        doParrinelloRahman,
                               const float                       dtPressureCouple,
-                              const matrix                      velocityScalingMatrix)
+                              const matrix                      prVelocityScalingMatrix)
  {
  
      ensureNoPendingCudaError("In CUDA version of Leap-Frog integrator");
  
      auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
-     if (doTempCouple || doPressureCouple)
+     if (doTemperatureScaling || doParrinelloRahman)
      {
-         if (doTempCouple)
+         if (doTemperatureScaling)
          {
              GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
                         "Number of temperature scaling factors changed since it was set for the "
              copyToDeviceBuffer(&d_lambdas_, h_lambdas_.data(), 0, numTempScaleValues_,
                                 commandStream_, GpuApiCallBehavior::Async, nullptr);
          }
-         VelocityScalingType velocityScaling = VelocityScalingType::None;
-         if (doPressureCouple)
+         VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
+         if (doParrinelloRahman)
          {
-             velocityScaling = VelocityScalingType::Diagonal;
-             GMX_ASSERT(velocityScalingMatrix[YY][XX] == 0 && velocityScalingMatrix[ZZ][XX] == 0
-                                && velocityScalingMatrix[ZZ][YY] == 0 && velocityScalingMatrix[XX][YY] == 0
-                                && velocityScalingMatrix[XX][ZZ] == 0 && velocityScalingMatrix[YY][ZZ] == 0,
+             prVelocityScalingType = VelocityScalingType::Diagonal;
+             GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
+                                && prVelocityScalingMatrix[ZZ][YY] == 0
+                                && prVelocityScalingMatrix[XX][YY] == 0
+                                && prVelocityScalingMatrix[XX][ZZ] == 0
+                                && prVelocityScalingMatrix[YY][ZZ] == 0,
                         "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
                         "in GPU version of Leap-Frog integrator.");
-             velocityScalingMatrixDiagonal_ =
-                     make_float3(dtPressureCouple * velocityScalingMatrix[XX][XX],
-                                 dtPressureCouple * velocityScalingMatrix[YY][YY],
-                                 dtPressureCouple * velocityScalingMatrix[ZZ][ZZ]);
+             prVelocityScalingMatrixDiagonal_ =
+                     make_float3(dtPressureCouple * prVelocityScalingMatrix[XX][XX],
+                                 dtPressureCouple * prVelocityScalingMatrix[YY][YY],
+                                 dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ]);
          }
-         kernelPtr = selectLeapFrogKernelPtr(numTempScaleValues_, velocityScaling);
+         kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
      }
  
      const auto kernelArgs = prepareGpuKernelArguments(
              kernelPtr, kernelLaunchConfig_, &numAtoms_, &d_x, &d_xp, &d_v, &d_f, &d_inverseMasses_,
-             &dt, &d_lambdas_, &d_tempScaleGroups_, &velocityScalingMatrixDiagonal_);
+             &dt, &d_lambdas_, &d_tempScaleGroups_, &prVelocityScalingMatrixDiagonal_);
      launchGpuKernel(kernelPtr, kernelLaunchConfig_, nullptr, "leapfrog_kernel", kernelArgs);
  
      return;
@@@ -339,6 -336,11 +336,6 @@@ LeapFrogCuda::~LeapFrogCuda(
      freeDeviceBuffer(&d_inverseMasses_);
  }
  
 -void LeapFrogCuda::setPbc(const t_pbc* pbc)
 -{
 -    setPbcAiuc(pbc->ndim_ePBC, pbc->box, &pbcAiuc_);
 -}
 -
  void LeapFrogCuda::set(const t_mdatoms& md, const int numTempScaleValues, const unsigned short* tempScaleGroups)
  {
      numAtoms_                       = md.nr;
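A minimal host-side sketch of the per-atom update that leapfrog_kernel performs in the hunks above, assuming a single temperature-coupling lambda; Vec3, leapFrogReference and the placement of the elided velocity/coordinate updates are illustrative assumptions, not GROMACS code.

#include <cstddef>
#include <vector>

struct Vec3
{
    float x, y, z;
};

// One leap-frog step per atom: scale the force-updated velocity by the
// temperature-coupling factor, subtract the diagonal Parrinello-Rahman term
// (computed from the old velocity), then advance the coordinates.
static void leapFrogReference(std::vector<Vec3>&        x,
                              std::vector<Vec3>&        v,
                              const std::vector<Vec3>&  f,
                              const std::vector<float>& invMass,
                              float                     dt,
                              float                     lambda,
                              Vec3                      prDiagonal)
{
    for (std::size_t i = 0; i < x.size(); i++)
    {
        Vec3 vp;
        vp.x = lambda * (v[i].x + f[i].x * invMass[i] * dt) - prDiagonal.x * v[i].x;
        vp.y = lambda * (v[i].y + f[i].y * invMass[i] * dt) - prDiagonal.y * v[i].y;
        vp.z = lambda * (v[i].z + f[i].z * invMass[i] * dt) - prDiagonal.z * v[i].z;
        v[i] = vp;
        x[i].x += vp.x * dt;
        x[i].y += vp.y * dt;
        x[i].z += vp.z * dt;
    }
}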
index 60dffc983e3f64272ed3b96caded1f2bd58275ac,cb71267a209ff1619f4dc331a734fdf0ceb783b0..cbf3738a8f82e6270f4363b67b29acba725f4583
@@@ -69,32 -69,41 +69,32 @@@ public
      LeapFrogCuda(CommandStream commandStream);
      ~LeapFrogCuda();
  
 -    /*! \brief
 -     * Update PBC data.
 -     *
 -     * Converts PBC data from t_pbc into the PbcAiuc format and stores the latter.
 -     *
 -     * \param[in] pbc The PBC data in t_pbc format.
 -     */
 -    void setPbc(const t_pbc* pbc);
 -
      /*! \brief Integrate
       *
       * Integrates the equation of motion using Leap-Frog algorithm.
       * Updates coordinates and velocities on the GPU. The current coordinates are saved for constraints.
       *
-      * \param[in,out] d_x                    Coordinates to update
-      * \param[out]    d_xp                   Place to save the values of initial coordinates coordinates to.
-      * \param[in,out] d_v                    Velocities (will be updated).
-      * \param[in]     d_f                    Forces.
-      * \param[in]     dt                     Timestep.
-      * \param[in]     doTempCouple           If the temperature coupling should be applied.
-      * \param[in]     tcstat                 Temperature coupling data.
-      * \param[in]     doPressureCouple       If the temperature coupling should be applied.
-      * \param[in]     dtPressureCouple       Period between pressure coupling steps
-      * \param[in]     velocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
+      * \param[in,out] d_x                      Coordinates to update
+      * \param[out]    d_xp                     Place to save the values of initial coordinates to.
+      * \param[in,out] d_v                      Velocities (will be updated).
+      * \param[in]     d_f                      Forces.
+      * \param[in]     dt                       Timestep.
+      * \param[in]     doTemperatureScaling     If velocities should be scaled for temperature coupling.
+      * \param[in]     tcstat                   Temperature coupling data.
+      * \param[in]     doParrinelloRahman       If current step is a Parrinello-Rahman pressure coupling step.
+      * \param[in]     dtPressureCouple         Period between pressure coupling steps
+      * \param[in]     prVelocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
       */
      void integrate(const float3*                     d_x,
                     float3*                           d_xp,
                     float3*                           d_v,
                     const float3*                     d_f,
                     const real                        dt,
-                    const bool                        doTempCouple,
+                    const bool                        doTemperatureScaling,
                     gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                    const bool                        doPressureCouple,
+                    const bool                        doParrinelloRahman,
                     const float                       dtPressureCouple,
-                    const matrix                      velocityScalingMatrix);
+                    const matrix                      prVelocityScalingMatrix);
  
      /*! \brief Set the integrator
       *
@@@ -116,6 -125,8 +116,6 @@@ private
      CommandStream commandStream_;
      //! CUDA kernel launch config
      KernelLaunchConfig kernelLaunchConfig_;
 -    //! Periodic boundary data
 -    PbcAiuc pbcAiuc_;
      //! Number of atoms
      int numAtoms_;
  
      //! Maximum size of the temperature coupling groups array
      int numTempScaleGroupsAlloc_ = -1;
  
-     //! Vector with diagonal elements of the pressure coupling velocity rescale factors
-     float3 velocityScalingMatrixDiagonal_;
+     //! Vector with diagonal elements of the Parrinello-Rahman pressure coupling velocity rescale factors
+     float3 prVelocityScalingMatrixDiagonal_;
  };
  
  } // namespace gmx
index c0caa1278647d4d5eee65a18c47ada36447f5fcb,84325141ad5365b3e9a5200799835f4eac332a8f..ce1b442f84902e61797ac90035188e9309839be1
@@@ -40,6 -40,8 +40,6 @@@
   * using CUDA, including class initialization, data-structures management
   * and GPU kernel.
   *
 - * \note Management of periodic boundary should be unified with SETTLE and
 - *       removed from here.
  * \todo Reconsider naming, i.e. "cuda" suffix should be changed to "gpu".
   *
   * \author Artem Zhmurov <zhmurov@gmail.com>
@@@ -432,8 -434,7 +432,8 @@@ void LincsCuda::apply(const float3* d_x
                        float3*       d_v,
                        const real    invdt,
                        const bool    computeVirial,
 -                      tensor        virialScaled)
 +                      tensor        virialScaled,
 +                      const PbcAiuc pbcAiuc)
  {
      ensureNoPendingCudaError("In CUDA version of LINCS");
  
      }
      config.stream = commandStream_;
  
 +    kernelParams_.pbcAiuc = pbcAiuc;
 +
      const auto kernelArgs =
              prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt);
  
@@@ -549,41 -548,84 +549,84 @@@ LincsCuda::~LincsCuda(
  
  /*! \brief Helper function to go through constraints recursively.
   *
-  *  For each constraint, counts the number of coupled constraints and stores the value in spaceNeeded array.
+  *  For each constraint, counts the number of coupled constraints and stores the value in the numCoupledConstraints array.
   *  This information is used to split the array of constraints between thread blocks on a GPU so there is no
-  *  coupling between constraints from different thread blocks. After the 'spaceNeeded' array is filled, the
-  *  value spaceNeeded[c] should be equal to the number of constraints that are coupled to 'c' and located
+  *  coupling between constraints from different thread blocks. After the 'numCoupledConstraints' array is filled, the
+  *  value numCoupledConstraints[c] should be equal to the number of constraints that are coupled to 'c' and located
   *  after it in the constraints array.
   *
-  * \param[in]     a                  Atom index.
-  * \param[in,out] spaceNeeded        Indicates if the constraint was already counted and stores
-  *                                   the number of constraints (i) connected to it and (ii) located
-  *                                   after it in memory. This array is filled by this recursive function.
-  *                                   For a set of coupled constraints, only for the first one in this list
-  *                                   the number of consecutive coupled constraints is needed: if there is
-  *                                   not enough space for this set of constraints in the thread block,
-  *                                   the group has to be moved to the next one.
-  * \param[in]     atomAdjacencyList  Stores information about connections between atoms.
+  * \param[in]     a                      Atom index.
+  * \param[in,out] numCoupledConstraints  Indicates if the constraint was already counted and stores
+  *                                       the number of constraints (i) connected to it and (ii) located
+  *                                       after it in memory. This array is filled by this recursive function.
+  *                                       For a set of coupled constraints, the number of consecutive coupled
+  *                                       constraints is only needed for the first one in the set: if there is
+  *                                       not enough space for this set of constraints in the thread block,
+  *                                       the group has to be moved to the next one.
+  * \param[in]     atomsAdjacencyList     Stores information about connections between atoms.
   */
- inline int countCoupled(int                                                  a,
-                         std::vector<int>*                                    spaceNeeded,
-                         std::vector<std::vector<std::tuple<int, int, int>>>* atomsAdjacencyList)
+ inline int countCoupled(int           a,
+                         ArrayRef<int> numCoupledConstraints,
+                         ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList)
  
  {
-     int c2, a2, sign;
      int counted = 0;
-     for (unsigned i = 0; i < atomsAdjacencyList->at(a).size(); i++)
+     for (const auto& adjacentAtom : atomsAdjacencyList[a])
      {
-         std::tie(a2, c2, sign) = atomsAdjacencyList->at(a).at(i);
-         if (spaceNeeded->at(c2) == -1)
+         const int c2 = std::get<1>(adjacentAtom);
+         if (numCoupledConstraints[c2] == -1)
          {
-             spaceNeeded->at(c2) = 0; // To indicate we've been here
-             counted += 1 + countCoupled(a2, spaceNeeded, atomsAdjacencyList);
+             numCoupledConstraints[c2] = 0; // To indicate we've been here
+             counted += 1 + countCoupled(std::get<0>(adjacentAtom), numCoupledConstraints, atomsAdjacencyList);
          }
      }
      return counted;
  }
  
+ /*! \brief Add constraint to \p splitMap with all constraints coupled to it.
+  *
+  *  Adds the constraint \p c from the constraint list \p iatoms to the map \p splitMap
+  *  if it was not yet added. Then goes through all the constraints coupled to \p c
+  *  and calls itself recursively. This ensures that all the coupled constraints will
+  *  be added to neighboring locations in the final data structures on the device,
+  *  hence mapping all coupled constraints to the same thread block. A value of -1 in
+  *  the \p splitMap is used to flag that the constraint was not yet added to the \p splitMap.
+  *
+  * \param[in]     iatoms              The list of constraints.
+  * \param[in]     stride              Number of elements per constraint in \p iatoms.
+  * \param[in]     atomsAdjacencyList  Information about connections between atoms.
+  * \param[out]    splitMap            Map from sequential constraint indexes to the indexes used on the device.
+  * \param[in]     c                   Sequential index for constraint to consider adding.
+  * \param[in,out] currentMapIndex     The rolling index for the constraints mapping.
+  */
+ inline void addWithCoupled(const t_iatom*                                         iatoms,
+                            const int                                              stride,
+                            ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList,
+                            ArrayRef<int>                                          splitMap,
+                            const int                                              c,
+                            int*                                                   currentMapIndex)
+ {
+     if (splitMap[c] == -1)
+     {
+         splitMap[c] = *currentMapIndex;
+         (*currentMapIndex)++;
+         // Constraints, coupled through both atoms.
+         for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
+         {
+             const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
+             for (const auto& adjacentAtom : atomsAdjacencyList[a1])
+             {
+                 const int c2 = std::get<1>(adjacentAtom);
+                 if (c2 != c)
+                 {
+                     addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);
+                 }
+             }
+         }
+     }
+ }
  void LincsCuda::set(const t_idef& idef, const t_mdatoms& md)
  {
      int numAtoms = md.nr;
          atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1));
      }
  
-     // Compute, how many coupled constraints are in front of each constraint.
-     // Needed to introduce splits in data so that all coupled constraints will be computed in a single GPU block.
-     // The position 'c' of the vector spaceNeeded should have the number of constraints that are coupled to a constraint
-     // 'c' and are after 'c' in the vector. Only first index of the connected group of the constraints is needed later in the
-     // code, hence the spaceNeeded vector is also used to keep track if the constrain was already counted.
-     std::vector<int> spaceNeeded;
-     spaceNeeded.resize(numConstraints, -1);
-     std::fill(spaceNeeded.begin(), spaceNeeded.end(), -1);
+     // Compute how many constraints are coupled to each constraint.
+     // Needed to introduce splits in the data so that all coupled constraints will be computed in a
+     // single GPU block. Position 'c' of the numCoupledConstraints vector should hold the number
+     // of constraints that are coupled to constraint 'c' and come after 'c' in the vector. Only the
+     // first index of each connected group of constraints is needed later in the code, hence the
+     // numCoupledConstraints vector is also used to keep track of whether the constraint was already counted.
+     std::vector<int> numCoupledConstraints(numConstraints, -1);
      for (int c = 0; c < numConstraints; c++)
      {
          int a1 = iatoms[stride * c + 1];
          int a2 = iatoms[stride * c + 2];
-         if (spaceNeeded.at(c) == -1)
+         if (numCoupledConstraints.at(c) == -1)
          {
-             spaceNeeded.at(c) = countCoupled(a1, &spaceNeeded, &atomsAdjacencyList)
-                                 + countCoupled(a2, &spaceNeeded, &atomsAdjacencyList);
+             numCoupledConstraints.at(c) = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
+                                           + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
          }
      }
  
      // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
      // takes into account the empty spaces which might be needed in the end of each thread block.
-     std::vector<int> splitMap;
-     splitMap.resize(numConstraints, -1);
-     int currentMapIndex = 0;
+     std::vector<int> splitMap(numConstraints, -1);
+     int              currentMapIndex = 0;
      for (int c = 0; c < numConstraints; c++)
      {
          // Check if coupled constraints all fit in one block
-         GMX_RELEASE_ASSERT(
-                 spaceNeeded.at(c) < c_threadsPerBlock,
-                 "Maximum number of coupled constraints exceedes the size of the CUDA thread block. "
-                 "Most likely, you are trying to use GPU version of LINCS with constraints on "
-                 "all-bonds, "
-                 "which is not supported. Try using H-bonds constraints only.");
-         if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + spaceNeeded.at(c)) / c_threadsPerBlock)
+         if (numCoupledConstraints.at(c) > c_threadsPerBlock)
+         {
+             gmx_fatal(FARGS,
+                       "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
+                       "thread block (%d). Most likely, you are trying to use the GPU version of "
+                       "LINCS with constraints on all-bonds, which is not supported for large "
+                       "molecules. When compatible with the force field and integration settings, "
+                       "consider using constraints on H-bonds only.",
+                       numCoupledConstraints.at(c), c_threadsPerBlock);
+         }
+         if (currentMapIndex / c_threadsPerBlock
+             != (currentMapIndex + numCoupledConstraints.at(c)) / c_threadsPerBlock)
          {
              currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
          }
-         splitMap.at(c) = currentMapIndex;
-         currentMapIndex++;
+         addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, &currentMapIndex);
      }
      kernelParams_.numConstraintsThreads =
              currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
      GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
                         maxCoupledConstraints * kernelParams_.numConstraintsThreads, commandStream_,
                         GpuApiCallBehavior::Sync, nullptr);
  
-     GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of attoms should be specified.\n");
+     GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n");
      copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, commandStream_,
                         GpuApiCallBehavior::Sync, nullptr);
  }
  
 -void LincsCuda::setPbc(const t_pbc* pbc)
 -{
 -    setPbcAiuc(pbc->ndim_ePBC, pbc->box, &kernelParams_.pbcAiuc);
 -}
 -
  } // namespace gmx
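A standalone toy illustration of the traversal idea behind countCoupled() and the numCoupledConstraints bookkeeping above; the pair-based constraint type, main() and the exact quantity stored (here the full size of each coupled group) are simplifying assumptions, not the GROMACS implementation.

#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

using Constraint = std::pair<int, int>; // a constraint couples two atoms

// Recursively count not-yet-visited constraints reachable from 'atom';
// visited constraints are marked with 0 so they are counted only once.
static int countCoupled(int atom, std::vector<int>& numCoupled,
                        const std::vector<std::vector<std::pair<int, int>>>& adjacency)
{
    int counted = 0;
    for (const auto& [otherAtom, c2] : adjacency[atom])
    {
        if (numCoupled[c2] == -1)
        {
            numCoupled[c2] = 0; // mark as visited
            counted += 1 + countCoupled(otherAtom, numCoupled, adjacency);
        }
    }
    return counted;
}

int main()
{
    // Toy system: constraints 0-2 form one coupled chain, constraint 3 is independent.
    const std::vector<Constraint> constraints = { { 0, 1 }, { 1, 2 }, { 2, 3 }, { 4, 5 } };
    const int                     numAtoms    = 6;

    // Adjacency list: for each atom, the (other atom, constraint index) pairs it takes part in.
    std::vector<std::vector<std::pair<int, int>>> adjacency(numAtoms);
    for (int c = 0; c < static_cast<int>(constraints.size()); c++)
    {
        adjacency[constraints[c].first].emplace_back(constraints[c].second, c);
        adjacency[constraints[c].second].emplace_back(constraints[c].first, c);
    }

    // After this loop, the first constraint of each coupled group holds the group size;
    // the remaining members stay at 0 (visited). This is what allows whole coupled groups
    // to be placed into a single thread block.
    std::vector<int> numCoupled(constraints.size(), -1);
    for (int c = 0; c < static_cast<int>(constraints.size()); c++)
    {
        if (numCoupled[c] == -1)
        {
            numCoupled[c] = countCoupled(constraints[c].first, numCoupled, adjacency)
                            + countCoupled(constraints[c].second, numCoupled, adjacency);
        }
    }

    for (std::size_t c = 0; c < constraints.size(); c++)
    {
        std::printf("constraint %zu: count %d\n", c, numCoupled[c]);
    }
    return 0;
}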
index aa2c7dbadf658d315fe13a2ac4a33c9a5261dacd,c6adc7b73c3a545bca023176f0c268c112a11ec8..692c2570b66f9dfa613e29dae78d4f183685027f
@@@ -843,7 -843,7 +843,7 @@@ static StepWorkload setupStepWorkload(c
  
  /* \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
   *
 - * TODO: eliminate the \p useGpuNonbonded and \p useGpuNonbonded when these are
 + * TODO: eliminate \p useGpuPmeOnThisRank when this is
   * incorporated in DomainLifetimeWorkload.
   */
  static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
                                      gmx_pme_t*                        pmedata,
                                      gmx_enerdata_t*                   enerd,
                                      const gmx::MdrunScheduleWorkload& runScheduleWork,
 -                                    bool                              useGpuNonbonded,
 -                                    bool                              useGpuPme,
 +                                    bool                              useGpuPmeOnThisRank,
                                      int64_t                           step,
                                      gmx_wallcycle_t                   wcycle)
  {
 -    if (useGpuNonbonded)
 +    if (runScheduleWork.simulationWork.useGpuNonbonded)
      {
          /* Launch pruning before buffer clearing because the API overhead of the
           * clear kernel launches can leave the GPU idle while it could be running
          wallcycle_stop(wcycle, ewcLAUNCH_GPU);
      }
  
 -    if (useGpuPme)
 +    if (useGpuPmeOnThisRank)
      {
          pme_gpu_reinit_computation(pmedata, wcycle);
      }
@@@ -1031,10 -1032,11 +1031,11 @@@ void do_force(FILE
          }
      }
  
-     // Copy coordinate from the GPU if update is on the GPU and there are forces to be computed on the CPU. At search steps the
-     // current coordinates are already on the host, hence copy is not needed.
+     // Copy coordinates from the GPU if the update is done on the GPU and there are forces to be
+     // computed on the CPU, or for the computation of the virial. At search steps the current
+     // coordinates are already on the host, hence the copy is not needed.
      if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
-         && runScheduleWork->domainWork.haveCpuLocalForceWork)
+         && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
      {
          stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
          stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
              }
              wallcycle_stop(wcycle, ewcLAUNCH_GPU);
          }
 -    }
  
 -    if (stepWork.doNeighborSearch)
 -    {
          // Need to run after the GPU-offload bonded interaction lists
          // are set up to be able to determine whether there is bonded work.
          runScheduleWork->domainWork = setupDomainLifetimeWorkload(
      if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
      {
          /* Wait for non-local coordinate data to be copied from device */
-         nbv->wait_nonlocal_x_copy_D2H_done();
+         stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
      }
      /* Compute the bonded and non-bonded energies and optionally forces */
      do_force_lowlevel(fr, inputrec, &(top->idef), cr, ms, nrnb, wcycle, mdatoms, x, hist, &forceOut, enerd,
                                             fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU
                          : nullptr; // PME reduction not active on GPU
  
-         gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
+         gmx::FixedCapacityVector<GpuEventSynchronizer*, 3> dependencyList;
  
          if (stepWork.useGpuPmeFReduction)
          {
              }
              if (useGpuForcesHaloExchange)
              {
-                 // Add a stream synchronization to satisfy a dependency
-                 // for the local buffer ops on the result of GPU halo
-                 // exchange, which operates in the non-local stream and
-                 // writes to to local parf og the force buffer.
-                 //
-                 // TODO improve this through use of an event - see Redmine #3093
-                 //      push the event into the dependencyList
-                 nbv->stream_local_wait_for_nonlocal();
+                 dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
              }
              nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
                                                dependencyList, stepWork.useGpuPmeFReduction,
      }
  
      launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork,
 -                            simulationWork.useGpuNonbonded, useGpuPmeOnThisRank, step, wcycle);
 +                            useGpuPmeOnThisRank, step, wcycle);
  
      if (DOMAINDECOMP(cr))
      {
index 3f4109db5401f7cb8e32ea599baf5e46c01be879,add995f26a0322b281321bf1608557f319e417a4..7f38b8d74cc74a1b9917dea8f23c2d17b13b9f33
  
  namespace gmx
  {
+ /*! \brief Number of CUDA threads in a block
+  *
+  * \todo Check if using a smaller block size will lead to better performance.
+  */
+ constexpr static int c_threadsPerBlock = 256;
+ //! Maximum number of threads in a block (for __launch_bounds__)
+ constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
+ /*! \brief Scaling matrix struct.
+  *
+  * \todo Should be generalized.
+  */
+ struct ScalingMatrix
+ {
+     float xx, yy, zz, yx, zx, zy;
+ };
+ __launch_bounds__(c_maxThreadsPerBlock) __global__
+         static void scaleCoordinates_kernel(const int numAtoms,
+                                             float3* __restrict__ gm_x,
+                                             const ScalingMatrix scalingMatrix)
+ {
+     int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+     if (threadIndex < numAtoms)
+     {
+         float3 x = gm_x[threadIndex];
+         x.x = scalingMatrix.xx * x.x + scalingMatrix.yx * x.y + scalingMatrix.zx * x.z;
+         x.y = scalingMatrix.yy * x.y + scalingMatrix.zy * x.z;
+         x.z = scalingMatrix.zz * x.z;
+         gm_x[threadIndex] = x;
+     }
+ }
  
  void UpdateConstrainCuda::Impl::integrate(GpuEventSynchronizer*             fReadyOnDevice,
                                            const real                        dt,
                                            const bool                        updateVelocities,
                                            const bool                        computeVirial,
                                            tensor                            virial,
-                                           const bool                        doTempCouple,
+                                           const bool                        doTemperatureScaling,
                                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                                           const bool                        doPressureCouple,
+                                           const bool                        doParrinelloRahman,
                                            const float                       dtPressureCouple,
-                                           const matrix                      velocityScalingMatrix)
+                                           const matrix                      prVelocityScalingMatrix)
  {
      // Clearing virial matrix
      // TODO There is no point in having separate virial matrix for constraints
  
      // The integrate should save a copy of the current coordinates in d_xp_ and write the updated ones
      // into d_x_. The d_xp_ is only needed by constraints.
-     integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTempCouple, tcstat, doPressureCouple,
-                            dtPressureCouple, velocityScalingMatrix);
+     integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat,
+                            doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
      // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
      // are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the
      // d_xp_ -> d_x_ copy after constraints. Note that the integrate saves them in the wrong order as well.
 -    lincsCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial);
 -    settleCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial);
 +    lincsCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
 +    settleCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
  
      // scaledVirial -> virial (methods above returns scaled values)
      float scaleFactor = 0.5f / (dt * dt);
      return;
  }
  
+ void UpdateConstrainCuda::Impl::scaleCoordinates(const matrix scalingMatrix)
+ {
+     ScalingMatrix mu;
+     mu.xx = scalingMatrix[XX][XX];
+     mu.yy = scalingMatrix[YY][YY];
+     mu.zz = scalingMatrix[ZZ][ZZ];
+     mu.yx = scalingMatrix[YY][XX];
+     mu.zx = scalingMatrix[ZZ][XX];
+     mu.zy = scalingMatrix[ZZ][YY];
+     const auto kernelArgs = prepareGpuKernelArguments(
+             scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_x_, &mu);
+     launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, nullptr,
+                     "scaleCoordinates_kernel", kernelArgs);
+     // TODO: Although this only happens on the pressure coupling steps, this synchronization
+     //       can affect the performance if nstpcouple is small.
+     gpuStreamSynchronize(commandStream_);
+ }
  UpdateConstrainCuda::Impl::Impl(const t_inputrec&     ir,
                                  const gmx_mtop_t&     mtop,
                                  const void*           commandStream,
      integrator_ = std::make_unique<LeapFrogCuda>(commandStream_);
      lincsCuda_  = std::make_unique<LincsCuda>(ir.nLincsIter, ir.nProjOrder, commandStream_);
      settleCuda_ = std::make_unique<SettleCuda>(mtop, commandStream_);
+     coordinateScalingKernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
+     coordinateScalingKernelLaunchConfig_.blockSize[1]     = 1;
+     coordinateScalingKernelLaunchConfig_.blockSize[2]     = 1;
+     coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
+     coordinateScalingKernelLaunchConfig_.stream           = commandStream_;
  }
  
  UpdateConstrainCuda::Impl::~Impl() {}
@@@ -155,11 -214,17 +214,14 @@@ void UpdateConstrainCuda::Impl::set(Dev
      integrator_->set(md, numTempScaleValues, md.cTC);
      lincsCuda_->set(idef, md);
      settleCuda_->set(idef, md);
+     coordinateScalingKernelLaunchConfig_.gridSize[0] =
+             (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
  }
  
  void UpdateConstrainCuda::Impl::setPbc(const t_pbc* pbc)
  {
      setPbcAiuc(pbc->ndim_ePBC, pbc->box, &pbcAiuc_);
 -    integrator_->setPbc(pbc);
 -    lincsCuda_->setPbc(pbc);
 -    settleCuda_->setPbc(pbc);
  }
  
  GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
@@@ -182,14 -247,19 +244,19 @@@ void UpdateConstrainCuda::integrate(Gpu
                                      const bool                        updateVelocities,
                                      const bool                        computeVirial,
                                      tensor                            virialScaled,
-                                     const bool                        doTempCouple,
+                                     const bool                        doTemperatureScaling,
                                      gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                                     const bool                        doPressureCouple,
+                                     const bool                        doParrinelloRahman,
                                      const float                       dtPressureCouple,
-                                     const matrix                      velocityScalingMatrix)
+                                     const matrix                      prVelocityScalingMatrix)
+ {
+     impl_->integrate(fReadyOnDevice, dt, updateVelocities, computeVirial, virialScaled, doTemperatureScaling,
+                      tcstat, doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
+ }
+ void UpdateConstrainCuda::scaleCoordinates(const matrix scalingMatrix)
  {
-     impl_->integrate(fReadyOnDevice, dt, updateVelocities, computeVirial, virialScaled, doTempCouple,
-                      tcstat, doPressureCouple, dtPressureCouple, velocityScalingMatrix);
+     impl_->scaleCoordinates(scalingMatrix);
  }
  
  void UpdateConstrainCuda::set(DeviceBuffer<float>       d_x,
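A host-side sketch of the coordinate transform applied by the new scaleCoordinates_kernel above and of the round-up grid sizing used for its launch configuration; Vec3, ScalingMatrixRef and the helper names are illustrative assumptions, not GROMACS API.

#include <vector>

struct Vec3
{
    float x, y, z;
};

// Same six non-zero elements as the ScalingMatrix struct introduced above.
struct ScalingMatrixRef
{
    float xx, yy, zz, yx, zx, zy;
};

// Apply the same triangular scaling as scaleCoordinates_kernel, one atom at a time.
static void scaleCoordinatesReference(std::vector<Vec3>& x, const ScalingMatrixRef& m)
{
    for (Vec3& r : x)
    {
        const Vec3 old = r;
        r.x = m.xx * old.x + m.yx * old.y + m.zx * old.z;
        r.y = m.yy * old.y + m.zy * old.z;
        r.z = m.zz * old.z;
    }
}

// Round-up block count, matching gridSize[0] = (numAtoms + c_threadsPerBlock - 1) / c_threadsPerBlock.
static int numBlocks(int numAtoms, int threadsPerBlock)
{
    return (numAtoms + threadsPerBlock - 1) / threadsPerBlock;
}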
index 5fb17e91995d3ec9951f9443102341be6128be1e,d68a5fec5414746cd2e373289dfcfc7db653f9f3..75a6f6d14cb18b3607a68c2f2cec1135ea7fbe6d
@@@ -196,12 -196,12 +196,12 @@@ struct DevelopmentFeatureFlag
   *
   * \param[in]  mdlog                Logger object.
   * \param[in]  useGpuForNonbonded   True if the nonbonded task is offloaded in this run.
-  * \param[in]  useGpuForPme         True if the PME task is offloaded in this run.
+  * \param[in]  pmeRunMode           The PME run mode for this run.
   * \returns                         The object populated with development feature flags.
   */
  static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& mdlog,
                                                           const bool           useGpuForNonbonded,
-                                                          const bool           useGpuForPme)
+                                                          const PmeRunMode     pmeRunMode)
  {
      DevelopmentFeatureFlags devFlags;
  
  
      if (devFlags.enableGpuPmePPComm)
      {
-         if (useGpuForPme)
+         if (pmeRunMode == PmeRunMode::GPU)
          {
              GMX_LOG(mdlog.warning)
                      .asParagraph()
          }
          else
          {
+             std::string clarification;
+             if (pmeRunMode == PmeRunMode::Mixed)
+             {
+                 clarification =
+                         "PME FFT and gather are not offloaded to the GPU (PME is running in mixed "
+                         "mode).";
+             }
+             else
+             {
+                 clarification = "PME is not offloaded to the GPU.";
+             }
              GMX_LOG(mdlog.warning)
                      .asParagraph()
-                     .appendTextFormatted(
+                     .appendText(
                              "NOTE: GMX_GPU_PME_PP_COMMS environment variable detected, but the "
-                             "'GPU PME-PP communications' feature was not enabled as PME is not "
-                             "offloaded to the GPU.");
+                             "'GPU PME-PP communications' feature was not enabled as "
+                             + clarification);
              devFlags.enableGpuPmePPComm = false;
          }
      }
@@@ -889,7 -900,7 +900,7 @@@ int Mdrunner::mdrunner(
      // Initialize development feature flags that are enabled by environment variables
      // and report those features that are enabled.
      const DevelopmentFeatureFlags devFlags =
-             manageDevelopmentFeatures(mdlog, useGpuForNonbonded, useGpuForPme);
+             manageDevelopmentFeatures(mdlog, useGpuForNonbonded, pmeRunMode);
  
      // NOTE: The devFlags need decideWhetherToUseGpusForNonbonded(...) and decideWhetherToUseGpusForPme(...) for overrides,
      //       decideWhetherToUseGpuForUpdate() needs devFlags for the '-update auto' override, hence the interleaving.
              /* This call is not included in init_domain_decomposition mainly
               * because fr->cginfo_mb is set later.
               */
 -            dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
 +            dd_init_bondeds(fplog, cr->dd, mtop, vsite.get(), inputrec,
                              domdecOptions.checkBondedInteractions, fr->cginfo_mb);
          }
  
      }
  
      // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
 -    // before we destroy the GPU context(s) in free_gpu_resources().
 +    // before we destroy the GPU context(s) in free_gpu().
      // Pinned buffers are associated with contexts in CUDA.
      // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
      mdAtoms.reset(nullptr);
      globalState.reset(nullptr);
      mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
 +    /* Free pinned buffers in *fr */
 +    delete fr;
 +    fr = nullptr;
 +
 +    if (hwinfo->gpu_info.n_dev > 0)
 +    {
 +        /* stop the GPU profiler (only CUDA) */
 +        stopGpuProfiler();
 +    }
 +
 +    /* With tMPI we need to wait for all ranks to finish deallocation before
 +     * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing
 +     * GPU and context.
 +     *
 +     * This is not a concern in OpenCL where we use one context per rank which
 +     * is freed in nbnxn_gpu_free().
 +     *
 +     * Note: it is safe to not call the barrier on the ranks which do not use GPU,
 +     * but it is easier and more futureproof to call it on the whole node.
 +     *
 +     * Note that this function needs to be called even if GPUs are not used
 +     * in this run because the PME ranks have no knowledge of whether GPUs
 +     * are used or not, but all ranks need to enter the barrier below.
 +     * \todo Remove this physical node barrier after making sure
 +     * that it's not needed anymore (with a shared GPU run).
 +     */
 +    if (GMX_THREAD_MPI)
 +    {
 +        physicalNodeComm.barrier();
 +    }
  
 -    /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
 -    free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
      free_gpu(nonbondedDeviceInfo);
      free_gpu(pmeDeviceInfo);
 -    done_forcerec(fr, mtop.molblock.size());
      sfree(fcd);
  
      if (doMembed)
index d68e3e77b933d15bfbe2000017a0892f1fbdcd54,f42ad7230e9b220eaca5be3cae2ebb401fa8390f..9d674afd8f5d05c8abdbadeab75010efe7c032ff
@@@ -194,7 -194,15 +194,15 @@@ void StatePropagatorDataGpu::Impl::rein
      }
  
      reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+     const int d_fOldCapacity = d_fCapacity_;
      reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+     // Clearing of the forces can be done in the local stream since the nonlocal stream cannot reach
+     // the force accumulation stage before syncing with the local stream. Only done in CUDA,
+     // since the force buffer ops are not implemented in OpenCL.
+     if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
+     {
+         clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
+     }
  }
  
  std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
@@@ -238,13 -246,8 +246,13 @@@ void StatePropagatorDataGpu::Impl::copy
  
      GMX_UNUSED_VALUE(dataSize);
  
 +    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 +
      GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
  
 +    GMX_ASSERT(commandStream != nullptr,
 +               "No stream is valid for copying with given atom locality.");
 +
      int atomsStartAt, numAtomsToCopy;
      std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
  
@@@ -272,13 -275,8 +280,13 @@@ void StatePropagatorDataGpu::Impl::copy
  
      GMX_UNUSED_VALUE(dataSize);
  
 +    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 +
      GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
  
 +    GMX_ASSERT(commandStream != nullptr,
 +               "No stream is valid for copying with given atom locality.");
 +
      int atomsStartAt, numAtomsToCopy;
      std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
  
@@@ -305,7 -303,12 +313,7 @@@ DeviceBuffer<float> StatePropagatorData
  void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
                                                          AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = xCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying positions with given atom locality.");
 -
 -    copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream);
 +    copyToDevice(d_x_, h_x, d_xSize_, atomLocality, xCopyStreams_[atomLocality]);
  
      // markEvent is skipped in OpenCL as:
      //   - it's not needed, copy is done in the same stream as the only consumer task (PME)
      // TODO: remove this by adding an event-mark free flavor of this function
      if (GMX_GPU == GMX_GPU_CUDA)
      {
 -        xReadyOnDevice_[atomLocality].markEvent(commandStream);
 +        xReadyOnDevice_[atomLocality].markEvent(xCopyStreams_[atomLocality]);
      }
  }
  
@@@ -357,9 -360,14 +365,9 @@@ GpuEventSynchronizer* StatePropagatorDa
  
  void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = xCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying positions with given atom locality.");
 -
 -    copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream);
 +    copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, xCopyStreams_[atomLocality]);
      // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
 -    xReadyOnHost_[atomLocality].markEvent(commandStream);
 +    xReadyOnHost_[atomLocality].markEvent(xCopyStreams_[atomLocality]);
  }
  
  void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
@@@ -376,8 -384,13 +384,8 @@@ DeviceBuffer<float> StatePropagatorData
  void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
                                                         AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = vCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying velocities with given atom locality.");
 -
 -    copyToDevice(d_v_, h_v, d_vSize_, atomLocality, commandStream);
 -    vReadyOnDevice_[atomLocality].markEvent(commandStream);
 +    copyToDevice(d_v_, h_v, d_vSize_, atomLocality, vCopyStreams_[atomLocality]);
 +    vReadyOnDevice_[atomLocality].markEvent(vCopyStreams_[atomLocality]);
  }
  
  GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
  
  void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v, AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = vCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying velocities with given atom locality.");
 -
 -    copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream);
 -    vReadyOnHost_[atomLocality].markEvent(commandStream);
 +    copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, vCopyStreams_[atomLocality]);
 +    vReadyOnHost_[atomLocality].markEvent(vCopyStreams_[atomLocality]);
  }
  
  void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
@@@ -406,8 -424,13 +414,8 @@@ DeviceBuffer<float> StatePropagatorData
  void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
                                                     AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = fCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying forces with given atom locality.");
 -
 -    copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
 -    fReadyOnDevice_[atomLocality].markEvent(commandStream);
 +    copyToDevice(d_f_, h_f, d_fSize_, atomLocality, fCopyStreams_[atomLocality]);
 +    fReadyOnDevice_[atomLocality].markEvent(fCopyStreams_[atomLocality]);
  }
  
  GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
@@@ -430,8 -453,13 +438,8 @@@ GpuEventSynchronizer* StatePropagatorDa
  
  void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = fCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying forces with given atom locality.");
 -
 -    copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
 -    fReadyOnHost_[atomLocality].markEvent(commandStream);
 +    copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, fCopyStreams_[atomLocality]);
 +    fReadyOnHost_[atomLocality].markEvent(fCopyStreams_[atomLocality]);
  }
  
  void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)
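A minimal sketch of the reallocate-then-clear pattern added to StatePropagatorDataGpu::Impl::reinit() above, where the force buffer is cleared only when its capacity actually changed; the std::vector buffer, FakeStream and clearAsync() are stand-ins for the device buffer, CUDA stream and clearDeviceBufferAsync(), not the real GROMACS wrappers.

#include <cstddef>
#include <cstring>
#include <vector>

struct FakeStream
{
};

// Stand-in for clearDeviceBufferAsync(): zero the whole buffer "in" the given stream.
static void clearAsync(std::vector<float>& buffer, FakeStream& /*stream*/)
{
    std::memset(buffer.data(), 0, buffer.size() * sizeof(float));
}

// Grow the force buffer to hold DIM * numAtomsAll floats and clear it only when
// the capacity changed, mirroring the GMX_GPU_CUDA branch in reinit().
static void reinitForces(std::vector<float>& d_f, int& capacity, int numAtomsAll, FakeStream& localStream)
{
    const int oldCapacity = capacity;
    const int requested   = 3 * numAtomsAll; // DIM * numAtomsAll
    if (requested > capacity)
    {
        capacity = requested;
        d_f.resize(static_cast<std::size_t>(capacity));
    }
    if (capacity != oldCapacity)
    {
        clearAsync(d_f, localStream);
    }
}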