Merge origin/release-2020 into master
authorPaul Bauer <paul.bauer.q@gmail.com>
Sat, 30 Nov 2019 10:58:42 +0000 (11:58 +0100)
committerPaul Bauer <paul.bauer.q@gmail.com>
Sat, 30 Nov 2019 10:58:42 +0000 (11:58 +0100)
Resolved Conflicts:
src/gromacs/mdlib/update.cpp
src/gromacs/mdrun/md.cpp

Change-Id: Ibf1c632cc3096121dccd6c1c1af9bff3d590d18f

1  2 
src/gromacs/mdlib/leapfrog_cuda.cu
src/gromacs/mdlib/leapfrog_cuda.cuh
src/gromacs/mdlib/lincs_cuda.cu
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdlib/update_constrain_cuda_impl.cu
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 18768c28f29fd59e7647c9a19757ebe34833ea9e,4415158d32ea7d051e16d327afa515c1b4b28934..ac4135960740aea0c45f4e8db8deb27856b2db45
@@@ -111,19 -111,19 +111,19 @@@ enum class VelocityScalingTyp
   *  \todo Check if the force should be set to zero here.
   *  \todo This kernel can also accumulate incidental temperatures for each atom.
   *
-  * \tparam        numTempScaleValues             The number of different T-couple values.
-  * \tparam        velocityScaling                Type of the Parrinello-Rahman velocity rescaling.
-  * \param[in]     numAtoms                       Total number of atoms.
-  * \param[in,out] gm_x                           Coordinates to update upon integration.
-  * \param[out]    gm_xp                          A copy of the coordinates before the integration (for constraints).
-  * \param[in,out] gm_v                           Velocities to update.
-  * \param[in]     gm_f                           Atomic forces.
-  * \param[in]     gm_inverseMasses               Reciprocal masses.
-  * \param[in]     dt                             Timestep.
-  * \param[in]     gm_lambdas                     Temperature scaling factors (one per group)
-  * \param[in]     gm_tempScaleGroups             Mapping of atoms into groups.
-  * \param[in]     dtPressureCouple               Time step for pressure coupling
-  * \param[in]     velocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
+  * \tparam        numTempScaleValues               The number of different T-couple values.
+  * \tparam        velocityScaling                  Type of the Parrinello-Rahman velocity rescaling.
+  * \param[in]     numAtoms                         Total number of atoms.
+  * \param[in,out] gm_x                             Coordinates to update upon integration.
+  * \param[out]    gm_xp                            A copy of the coordinates before the integration (for constraints).
+  * \param[in,out] gm_v                             Velocities to update.
+  * \param[in]     gm_f                             Atomic forces.
+  * \param[in]     gm_inverseMasses                 Reciprocal masses.
+  * \param[in]     dt                               Timestep.
+  * \param[in]     gm_lambdas                       Temperature scaling factors (one per group)
+  * \param[in]     gm_tempScaleGroups               Mapping of atoms into groups.
+  * \param[in]     dtPressureCouple                 Time step for pressure coupling
+  * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
   */
  template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
  __launch_bounds__(c_maxThreadsPerBlock) __global__
                               const float dt,
                               const float* __restrict__ gm_lambdas,
                               const unsigned short* __restrict__ gm_tempScaleGroups,
-                              const float3 velocityScalingMatrixDiagonal);
+                              const float3 prVelocityScalingMatrixDiagonal);
  
  template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
  __launch_bounds__(c_maxThreadsPerBlock) __global__
                               const float dt,
                               const float* __restrict__ gm_lambdas,
                               const unsigned short* __restrict__ gm_tempScaleGroups,
-                              const float3 velocityScalingMatrixDiagonal)
+                              const float3 prVelocityScalingMatrixDiagonal)
  {
      int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
      if (threadIndex < numAtoms)
  
              if (velocityScaling == VelocityScalingType::Diagonal)
              {
-                 vp.x -= velocityScalingMatrixDiagonal.x * v.x;
-                 vp.y -= velocityScalingMatrixDiagonal.y * v.y;
-                 vp.z -= velocityScalingMatrixDiagonal.z * v.z;
+                 vp.x -= prVelocityScalingMatrixDiagonal.x * v.x;
+                 vp.y -= prVelocityScalingMatrixDiagonal.y * v.y;
+                 vp.z -= prVelocityScalingMatrixDiagonal.z * v.z;
              }
  
              v = vp;
  
  /*! \brief Select templated kernel.
   *
-  * Returns pointer to a CUDA kernel based on the number of temperature coupling groups.
-  * If zero is passed as an argument, it is assumed that no temperature coupling groups are used.
+  * Returns a pointer to a CUDA kernel based on the number of temperature coupling groups and
+  * whether temperature and/or pressure coupling is enabled.
   *
-  * \param[in]  numTempScaleValues  Numer of temperature coupling groups in the system
-  * \param[in]  velocityScaling     Type of the Parrinello-Rahman velocity scaling
+  * \param[in]  doTemperatureScaling   If the kernel with temperature coupling velocity scaling
+  *                                    should be selected.
+  * \param[in]  numTempScaleValues     Number of temperature coupling groups in the system.
+  * \param[in]  prVelocityScalingType  Type of the Parrinello-Rahman velocity scaling.
   *
  *  \return                         Pointer to CUDA kernel
   */
- inline auto selectLeapFrogKernelPtr(int numTempScaleValues, VelocityScalingType velocityScaling)
+ inline auto selectLeapFrogKernelPtr(bool                doTemperatureScaling,
+                                     int                 numTempScaleValues,
+                                     VelocityScalingType prVelocityScalingType)
  {
+     // Check input for consistency: if there is temperature coupling, at least one coupling group should be defined.
+     GMX_ASSERT(!doTemperatureScaling || (numTempScaleValues > 0),
+                "Temperature coupling was requested with no temperature coupling groups.");
      auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
  
-     if (velocityScaling == VelocityScalingType::None)
+     if (prVelocityScalingType == VelocityScalingType::None)
      {
-         if (numTempScaleValues == 0)
+         if (!doTemperatureScaling)
          {
              kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
          }
          {
              kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::None>;
          }
-         else
-         {
-             GMX_RELEASE_ASSERT(false,
-                                "Number of temperature coupling groups should be greater than zero "
-                                "(zero for no coupling).");
-         }
      }
-     else if (velocityScaling == VelocityScalingType::Diagonal)
+     else if (prVelocityScalingType == VelocityScalingType::Diagonal)
      {
-         if (numTempScaleValues == 0)
+         if (!doTemperatureScaling)
          {
              kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::Diagonal>;
          }
          {
              kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::Diagonal>;
          }
-         else
-         {
-             GMX_RELEASE_ASSERT(false,
-                                "Number of temperature coupling groups should be greater than zero "
-                                "(zero for no coupling).");
-         }
      }
      else
      {
@@@ -272,19 -267,19 +267,19 @@@ void LeapFrogCuda::integrate(const floa
                               float3*                           d_v,
                               const float3*                     d_f,
                               const real                        dt,
-                              const bool                        doTempCouple,
+                              const bool                        doTemperatureScaling,
                               gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                              const bool                        doPressureCouple,
+                              const bool                        doParrinelloRahman,
                               const float                       dtPressureCouple,
-                              const matrix                      velocityScalingMatrix)
+                              const matrix                      prVelocityScalingMatrix)
  {
  
      ensureNoPendingCudaError("In CUDA version of Leap-Frog integrator");
  
      auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
-     if (doTempCouple || doPressureCouple)
+     if (doTemperatureScaling || doParrinelloRahman)
      {
-         if (doTempCouple)
+         if (doTemperatureScaling)
          {
              GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
                         "Number of temperature scaling factors changed since it was set for the "
              copyToDeviceBuffer(&d_lambdas_, h_lambdas_.data(), 0, numTempScaleValues_,
                                 commandStream_, GpuApiCallBehavior::Async, nullptr);
          }
-         VelocityScalingType velocityScaling = VelocityScalingType::None;
-         if (doPressureCouple)
+         VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
+         if (doParrinelloRahman)
          {
-             velocityScaling = VelocityScalingType::Diagonal;
-             GMX_ASSERT(velocityScalingMatrix[YY][XX] == 0 && velocityScalingMatrix[ZZ][XX] == 0
-                                && velocityScalingMatrix[ZZ][YY] == 0 && velocityScalingMatrix[XX][YY] == 0
-                                && velocityScalingMatrix[XX][ZZ] == 0 && velocityScalingMatrix[YY][ZZ] == 0,
+             prVelocityScalingType = VelocityScalingType::Diagonal;
+             GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
+                                && prVelocityScalingMatrix[ZZ][YY] == 0
+                                && prVelocityScalingMatrix[XX][YY] == 0
+                                && prVelocityScalingMatrix[XX][ZZ] == 0
+                                && prVelocityScalingMatrix[YY][ZZ] == 0,
                         "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
                         "in GPU version of Leap-Frog integrator.");
-             velocityScalingMatrixDiagonal_ =
-                     make_float3(dtPressureCouple * velocityScalingMatrix[XX][XX],
-                                 dtPressureCouple * velocityScalingMatrix[YY][YY],
-                                 dtPressureCouple * velocityScalingMatrix[ZZ][ZZ]);
+             prVelocityScalingMatrixDiagonal_ =
+                     make_float3(dtPressureCouple * prVelocityScalingMatrix[XX][XX],
+                                 dtPressureCouple * prVelocityScalingMatrix[YY][YY],
+                                 dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ]);
          }
-         kernelPtr = selectLeapFrogKernelPtr(numTempScaleValues_, velocityScaling);
+         kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
      }
  
      const auto kernelArgs = prepareGpuKernelArguments(
              kernelPtr, kernelLaunchConfig_, &numAtoms_, &d_x, &d_xp, &d_v, &d_f, &d_inverseMasses_,
-             &dt, &d_lambdas_, &d_tempScaleGroups_, &velocityScalingMatrixDiagonal_);
+             &dt, &d_lambdas_, &d_tempScaleGroups_, &prVelocityScalingMatrixDiagonal_);
      launchGpuKernel(kernelPtr, kernelLaunchConfig_, nullptr, "leapfrog_kernel", kernelArgs);
  
      return;
@@@ -339,6 -336,11 +336,6 @@@ LeapFrogCuda::~LeapFrogCuda(
      freeDeviceBuffer(&d_inverseMasses_);
  }
  
 -void LeapFrogCuda::setPbc(const t_pbc* pbc)
 -{
 -    setPbcAiuc(pbc->ndim_ePBC, pbc->box, &pbcAiuc_);
 -}
 -
  void LeapFrogCuda::set(const t_mdatoms& md, const int numTempScaleValues, const unsigned short* tempScaleGroups)
  {
      numAtoms_                       = md.nr;
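A minimal host-side sketch of the per-atom update that leapfrog_kernel performs in the hunks above, assuming a single temperature-coupling lambda; Vec3, leapFrogReference and the placement of the elided velocity/coordinate updates are illustrative assumptions, not GROMACS code.

#include <cstddef>
#include <vector>

struct Vec3
{
    float x, y, z;
};

// One leap-frog step per atom: scale the force-updated velocity by the
// temperature-coupling factor, subtract the diagonal Parrinello-Rahman term
// (computed from the old velocity), then advance the coordinates.
static void leapFrogReference(std::vector<Vec3>&        x,
                              std::vector<Vec3>&        v,
                              const std::vector<Vec3>&  f,
                              const std::vector<float>& invMass,
                              float                     dt,
                              float                     lambda,
                              Vec3                      prDiagonal)
{
    for (std::size_t i = 0; i < x.size(); i++)
    {
        Vec3 vp;
        vp.x = lambda * (v[i].x + f[i].x * invMass[i] * dt) - prDiagonal.x * v[i].x;
        vp.y = lambda * (v[i].y + f[i].y * invMass[i] * dt) - prDiagonal.y * v[i].y;
        vp.z = lambda * (v[i].z + f[i].z * invMass[i] * dt) - prDiagonal.z * v[i].z;
        v[i] = vp;
        x[i].x += vp.x * dt;
        x[i].y += vp.y * dt;
        x[i].z += vp.z * dt;
    }
}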
index 60dffc983e3f64272ed3b96caded1f2bd58275ac,cb71267a209ff1619f4dc331a734fdf0ceb783b0..cbf3738a8f82e6270f4363b67b29acba725f4583
@@@ -69,32 -69,41 +69,32 @@@ public
      LeapFrogCuda(CommandStream commandStream);
      ~LeapFrogCuda();
  
 -    /*! \brief
 -     * Update PBC data.
 -     *
 -     * Converts PBC data from t_pbc into the PbcAiuc format and stores the latter.
 -     *
 -     * \param[in] pbc The PBC data in t_pbc format.
 -     */
 -    void setPbc(const t_pbc* pbc);
 -
      /*! \brief Integrate
       *
       * Integrates the equation of motion using Leap-Frog algorithm.
       * Updates coordinates and velocities on the GPU. The current coordinates are saved for constraints.
       *
-      * \param[in,out] d_x                    Coordinates to update
-      * \param[out]    d_xp                   Place to save the values of initial coordinates coordinates to.
-      * \param[in,out] d_v                    Velocities (will be updated).
-      * \param[in]     d_f                    Forces.
-      * \param[in]     dt                     Timestep.
-      * \param[in]     doTempCouple           If the temperature coupling should be applied.
-      * \param[in]     tcstat                 Temperature coupling data.
-      * \param[in]     doPressureCouple       If the temperature coupling should be applied.
-      * \param[in]     dtPressureCouple       Period between pressure coupling steps
-      * \param[in]     velocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
+      * \param[in,out] d_x                      Coordinates to update
+      * \param[out]    d_xp                     Place to save the values of initial coordinates to.
+      * \param[in,out] d_v                      Velocities (will be updated).
+      * \param[in]     d_f                      Forces.
+      * \param[in]     dt                       Timestep.
+      * \param[in]     doTemperatureScaling     If velocities should be scaled for temperature coupling.
+      * \param[in]     tcstat                   Temperature coupling data.
+      * \param[in]     doParrinelloRahman       If current step is a Parrinello-Rahman pressure coupling step.
+      * \param[in]     dtPressureCouple         Period between pressure coupling steps
+      * \param[in]     prVelocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
       */
      void integrate(const float3*                     d_x,
                     float3*                           d_xp,
                     float3*                           d_v,
                     const float3*                     d_f,
                     const real                        dt,
-                    const bool                        doTempCouple,
+                    const bool                        doTemperatureScaling,
                     gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                    const bool                        doPressureCouple,
+                    const bool                        doParrinelloRahman,
                     const float                       dtPressureCouple,
-                    const matrix                      velocityScalingMatrix);
+                    const matrix                      prVelocityScalingMatrix);
  
      /*! \brief Set the integrator
       *
@@@ -116,6 -125,8 +116,6 @@@ private
      CommandStream commandStream_;
      //! CUDA kernel launch config
      KernelLaunchConfig kernelLaunchConfig_;
 -    //! Periodic boundary data
 -    PbcAiuc pbcAiuc_;
      //! Number of atoms
      int numAtoms_;
  
      //! Maximum size of the temperature coupling groups array
      int numTempScaleGroupsAlloc_ = -1;
  
-     //! Vector with diagonal elements of the pressure coupling velocity rescale factors
-     float3 velocityScalingMatrixDiagonal_;
+     //! Vector with diagonal elements of the Parrinello-Rahman pressure coupling velocity rescale factors
+     float3 prVelocityScalingMatrixDiagonal_;
  };
  
  } // namespace gmx
index c0caa1278647d4d5eee65a18c47ada36447f5fcb,84325141ad5365b3e9a5200799835f4eac332a8f..ce1b442f84902e61797ac90035188e9309839be1
@@@ -40,6 -40,8 +40,6 @@@
   * using CUDA, including class initialization, data-structures management
   * and GPU kernel.
   *
 - * \note Management of periodic boundary should be unified with SETTLE and
 - *       removed from here.
  * \todo Reconsider naming, i.e. "cuda" suffix should be changed to "gpu".
   *
   * \author Artem Zhmurov <zhmurov@gmail.com>
@@@ -432,8 -434,7 +432,8 @@@ void LincsCuda::apply(const float3* d_x
                        float3*       d_v,
                        const real    invdt,
                        const bool    computeVirial,
 -                      tensor        virialScaled)
 +                      tensor        virialScaled,
 +                      const PbcAiuc pbcAiuc)
  {
      ensureNoPendingCudaError("In CUDA version of LINCS");
  
      }
      config.stream = commandStream_;
  
 +    kernelParams_.pbcAiuc = pbcAiuc;
 +
      const auto kernelArgs =
              prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt);
  
@@@ -549,41 -548,84 +549,84 @@@ LincsCuda::~LincsCuda(
  
  /*! \brief Helper function to go through constraints recursively.
   *
-  *  For each constraint, counts the number of coupled constraints and stores the value in spaceNeeded array.
+  *  For each constraint, counts the number of coupled constraints and stores the value in the numCoupledConstraints array.
   *  This information is used to split the array of constraints between thread blocks on a GPU so there is no
-  *  coupling between constraints from different thread blocks. After the 'spaceNeeded' array is filled, the
-  *  value spaceNeeded[c] should be equal to the number of constraints that are coupled to 'c' and located
+  *  coupling between constraints from different thread blocks. After the 'numCoupledConstraints' array is filled, the
+  *  value numCoupledConstraints[c] should be equal to the number of constraints that are coupled to 'c' and located
   *  after it in the constraints array.
   *
-  * \param[in]     a                  Atom index.
-  * \param[in,out] spaceNeeded        Indicates if the constraint was already counted and stores
-  *                                   the number of constraints (i) connected to it and (ii) located
-  *                                   after it in memory. This array is filled by this recursive function.
-  *                                   For a set of coupled constraints, only for the first one in this list
-  *                                   the number of consecutive coupled constraints is needed: if there is
-  *                                   not enough space for this set of constraints in the thread block,
-  *                                   the group has to be moved to the next one.
-  * \param[in]     atomAdjacencyList  Stores information about connections between atoms.
+  * \param[in]     a                      Atom index.
+  * \param[in,out] numCoupledConstraints  Indicates if the constraint was already counted and stores
+  *                                       the number of constraints (i) connected to it and (ii) located
+  *                                       after it in memory. This array is filled by this recursive function.
+  *                                       For a set of coupled constraints, the number of consecutive coupled
+  *                                       constraints is only needed for the first one in the set: if there is
+  *                                       not enough space for this set of constraints in the thread block,
+  *                                       the group has to be moved to the next one.
+  * \param[in]     atomsAdjacencyList     Stores information about connections between atoms.
   */
- inline int countCoupled(int                                                  a,
-                         std::vector<int>*                                    spaceNeeded,
-                         std::vector<std::vector<std::tuple<int, int, int>>>* atomsAdjacencyList)
+ inline int countCoupled(int           a,
+                         ArrayRef<int> numCoupledConstraints,
+                         ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList)
  
  {
-     int c2, a2, sign;
      int counted = 0;
-     for (unsigned i = 0; i < atomsAdjacencyList->at(a).size(); i++)
+     for (const auto& adjacentAtom : atomsAdjacencyList[a])
      {
-         std::tie(a2, c2, sign) = atomsAdjacencyList->at(a).at(i);
-         if (spaceNeeded->at(c2) == -1)
+         const int c2 = std::get<1>(adjacentAtom);
+         if (numCoupledConstraints[c2] == -1)
          {
-             spaceNeeded->at(c2) = 0; // To indicate we've been here
-             counted += 1 + countCoupled(a2, spaceNeeded, atomsAdjacencyList);
+             numCoupledConstraints[c2] = 0; // To indicate we've been here
+             counted += 1 + countCoupled(std::get<0>(adjacentAtom), numCoupledConstraints, atomsAdjacencyList);
          }
      }
      return counted;
  }
  
+ /*! \brief Add constraint to \p splitMap with all constraints coupled to it.
+  *
+  *  Adds the constraint \p c from the constraint list \p iatoms to the map \p splitMap
+  *  if it was not yet added. Then goes through all the constraints coupled to \p c
+  *  and calls itself recursively. This ensures that all the coupled constraints will
+  *  be added to neighboring locations in the final data structures on the device,
+  *  hence mapping all coupled constraints to the same thread block. A value of -1 in
+  *  the \p splitMap is used to flag that the constraint was not yet added to the \p splitMap.
+  *
+  * \param[in]     iatoms              The list of constraints.
+  * \param[in]     stride              Number of elements per constraint in \p iatoms.
+  * \param[in]     atomsAdjacencyList  Information about connections between atoms.
+  * \param[out]    splitMap            Map from sequential constraint indexes to the indexes used on the device.
+  * \param[in]     c                   Sequential index for constraint to consider adding.
+  * \param[in,out] currentMapIndex     The rolling index for the constraints mapping.
+  */
+ inline void addWithCoupled(const t_iatom*                                         iatoms,
+                            const int                                              stride,
+                            ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList,
+                            ArrayRef<int>                                          splitMap,
+                            const int                                              c,
+                            int*                                                   currentMapIndex)
+ {
+     if (splitMap[c] == -1)
+     {
+         splitMap[c] = *currentMapIndex;
+         (*currentMapIndex)++;
+         // Constraints, coupled through both atoms.
+         for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
+         {
+             const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
+             for (const auto& adjacentAtom : atomsAdjacencyList[a1])
+             {
+                 const int c2 = std::get<1>(adjacentAtom);
+                 if (c2 != c)
+                 {
+                     addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);
+                 }
+             }
+         }
+     }
+ }
  void LincsCuda::set(const t_idef& idef, const t_mdatoms& md)
  {
      int numAtoms = md.nr;
          atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1));
      }
  
-     // Compute, how many coupled constraints are in front of each constraint.
-     // Needed to introduce splits in data so that all coupled constraints will be computed in a single GPU block.
-     // The position 'c' of the vector spaceNeeded should have the number of constraints that are coupled to a constraint
-     // 'c' and are after 'c' in the vector. Only first index of the connected group of the constraints is needed later in the
-     // code, hence the spaceNeeded vector is also used to keep track if the constrain was already counted.
-     std::vector<int> spaceNeeded;
-     spaceNeeded.resize(numConstraints, -1);
-     std::fill(spaceNeeded.begin(), spaceNeeded.end(), -1);
+     // Compute how many constraints are coupled to each constraint.
+     // Needed to introduce splits in the data so that all coupled constraints will be computed in a
+     // single GPU block. Position 'c' of the numCoupledConstraints vector should hold the number
+     // of constraints that are coupled to constraint 'c' and come after 'c' in the vector. Only the
+     // first index of each connected group of constraints is needed later in the code, hence the
+     // numCoupledConstraints vector is also used to keep track of whether the constraint was already counted.
+     std::vector<int> numCoupledConstraints(numConstraints, -1);
      for (int c = 0; c < numConstraints; c++)
      {
          int a1 = iatoms[stride * c + 1];
          int a2 = iatoms[stride * c + 2];
-         if (spaceNeeded.at(c) == -1)
+         if (numCoupledConstraints.at(c) == -1)
          {
-             spaceNeeded.at(c) = countCoupled(a1, &spaceNeeded, &atomsAdjacencyList)
-                                 + countCoupled(a2, &spaceNeeded, &atomsAdjacencyList);
+             numCoupledConstraints.at(c) = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
+                                           + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
          }
      }
  
      // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
      // takes into account the empty spaces which might be needed in the end of each thread block.
-     std::vector<int> splitMap;
-     splitMap.resize(numConstraints, -1);
-     int currentMapIndex = 0;
+     std::vector<int> splitMap(numConstraints, -1);
+     int              currentMapIndex = 0;
      for (int c = 0; c < numConstraints; c++)
      {
          // Check if coupled constraints all fit in one block
-         GMX_RELEASE_ASSERT(
-                 spaceNeeded.at(c) < c_threadsPerBlock,
-                 "Maximum number of coupled constraints exceedes the size of the CUDA thread block. "
-                 "Most likely, you are trying to use GPU version of LINCS with constraints on "
-                 "all-bonds, "
-                 "which is not supported. Try using H-bonds constraints only.");
-         if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + spaceNeeded.at(c)) / c_threadsPerBlock)
+         if (numCoupledConstraints.at(c) > c_threadsPerBlock)
+         {
+             gmx_fatal(FARGS,
+                       "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
+                       "thread block (%d). Most likely, you are trying to use the GPU version of "
+                       "LINCS with constraints on all-bonds, which is not supported for large "
+                       "molecules. When compatible with the force field and integration settings, "
+                       "consider using constraints on H-bonds only.",
+                       numCoupledConstraints.at(c), c_threadsPerBlock);
+         }
+         if (currentMapIndex / c_threadsPerBlock
+             != (currentMapIndex + numCoupledConstraints.at(c)) / c_threadsPerBlock)
          {
              currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
          }
-         splitMap.at(c) = currentMapIndex;
-         currentMapIndex++;
+         addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, &currentMapIndex);
      }
      kernelParams_.numConstraintsThreads =
              currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
      GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
                         maxCoupledConstraints * kernelParams_.numConstraintsThreads, commandStream_,
                         GpuApiCallBehavior::Sync, nullptr);
  
-     GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of attoms should be specified.\n");
+     GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n");
      copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, commandStream_,
                         GpuApiCallBehavior::Sync, nullptr);
  }
  
 -void LincsCuda::setPbc(const t_pbc* pbc)
 -{
 -    setPbcAiuc(pbc->ndim_ePBC, pbc->box, &kernelParams_.pbcAiuc);
 -}
 -
  } // namespace gmx
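A standalone toy illustration of the traversal idea behind countCoupled() and the numCoupledConstraints bookkeeping above; the pair-based constraint type, main() and the exact quantity stored (here the full size of each coupled group) are simplifying assumptions, not the GROMACS implementation.

#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

using Constraint = std::pair<int, int>; // a constraint couples two atoms

// Recursively count not-yet-visited constraints reachable from 'atom';
// visited constraints are marked with 0 so they are counted only once.
static int countCoupled(int atom, std::vector<int>& numCoupled,
                        const std::vector<std::vector<std::pair<int, int>>>& adjacency)
{
    int counted = 0;
    for (const auto& [otherAtom, c2] : adjacency[atom])
    {
        if (numCoupled[c2] == -1)
        {
            numCoupled[c2] = 0; // mark as visited
            counted += 1 + countCoupled(otherAtom, numCoupled, adjacency);
        }
    }
    return counted;
}

int main()
{
    // Toy system: constraints 0-2 form one coupled chain, constraint 3 is independent.
    const std::vector<Constraint> constraints = { { 0, 1 }, { 1, 2 }, { 2, 3 }, { 4, 5 } };
    const int                     numAtoms    = 6;

    // Adjacency list: for each atom, the (other atom, constraint index) pairs it takes part in.
    std::vector<std::vector<std::pair<int, int>>> adjacency(numAtoms);
    for (int c = 0; c < static_cast<int>(constraints.size()); c++)
    {
        adjacency[constraints[c].first].emplace_back(constraints[c].second, c);
        adjacency[constraints[c].second].emplace_back(constraints[c].first, c);
    }

    // After this loop, the first constraint of each coupled group holds the group size;
    // the remaining members stay at 0 (visited). This is what allows whole coupled groups
    // to be placed into a single thread block.
    std::vector<int> numCoupled(constraints.size(), -1);
    for (int c = 0; c < static_cast<int>(constraints.size()); c++)
    {
        if (numCoupled[c] == -1)
        {
            numCoupled[c] = countCoupled(constraints[c].first, numCoupled, adjacency)
                            + countCoupled(constraints[c].second, numCoupled, adjacency);
        }
    }

    for (std::size_t c = 0; c < constraints.size(); c++)
    {
        std::printf("constraint %zu: count %d\n", c, numCoupled[c]);
    }
    return 0;
}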
index aa2c7dbadf658d315fe13a2ac4a33c9a5261dacd,c6adc7b73c3a545bca023176f0c268c112a11ec8..692c2570b66f9dfa613e29dae78d4f183685027f
@@@ -843,7 -843,7 +843,7 @@@ static StepWorkload setupStepWorkload(c
  
  /* \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
   *
 - * TODO: eliminate the \p useGpuNonbonded and \p useGpuNonbonded when these are
 + * TODO: eliminate \p useGpuPmeOnThisRank when this is
   * incorporated in DomainLifetimeWorkload.
   */
  static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
                                      gmx_pme_t*                        pmedata,
                                      gmx_enerdata_t*                   enerd,
                                      const gmx::MdrunScheduleWorkload& runScheduleWork,
 -                                    bool                              useGpuNonbonded,
 -                                    bool                              useGpuPme,
 +                                    bool                              useGpuPmeOnThisRank,
                                      int64_t                           step,
                                      gmx_wallcycle_t                   wcycle)
  {
 -    if (useGpuNonbonded)
 +    if (runScheduleWork.simulationWork.useGpuNonbonded)
      {
          /* Launch pruning before buffer clearing because the API overhead of the
           * clear kernel launches can leave the GPU idle while it could be running
          wallcycle_stop(wcycle, ewcLAUNCH_GPU);
      }
  
 -    if (useGpuPme)
 +    if (useGpuPmeOnThisRank)
      {
          pme_gpu_reinit_computation(pmedata, wcycle);
      }
@@@ -1031,10 -1032,11 +1031,11 @@@ void do_force(FILE
          }
      }
  
-     // Copy coordinate from the GPU if update is on the GPU and there are forces to be computed on the CPU. At search steps the
-     // current coordinates are already on the host, hence copy is not needed.
+     // Copy coordinates from the GPU if the update is done on the GPU and there are forces to be
+     // computed on the CPU, or for the computation of the virial. At search steps the current
+     // coordinates are already on the host, hence the copy is not needed.
      if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
-         && runScheduleWork->domainWork.haveCpuLocalForceWork)
+         && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
      {
          stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
          stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
              }
              wallcycle_stop(wcycle, ewcLAUNCH_GPU);
          }
 -    }
  
 -    if (stepWork.doNeighborSearch)
 -    {
          // Need to run after the GPU-offload bonded interaction lists
          // are set up to be able to determine whether there is bonded work.
          runScheduleWork->domainWork = setupDomainLifetimeWorkload(
      if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
      {
          /* Wait for non-local coordinate data to be copied from device */
-         nbv->wait_nonlocal_x_copy_D2H_done();
+         stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
      }
      /* Compute the bonded and non-bonded energies and optionally forces */
      do_force_lowlevel(fr, inputrec, &(top->idef), cr, ms, nrnb, wcycle, mdatoms, x, hist, &forceOut, enerd,
                                             fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU
                          : nullptr; // PME reduction not active on GPU
  
-         gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
+         gmx::FixedCapacityVector<GpuEventSynchronizer*, 3> dependencyList;
  
          if (stepWork.useGpuPmeFReduction)
          {
              }
              if (useGpuForcesHaloExchange)
              {
-                 // Add a stream synchronization to satisfy a dependency
-                 // for the local buffer ops on the result of GPU halo
-                 // exchange, which operates in the non-local stream and
-                 // writes to to local parf og the force buffer.
-                 //
-                 // TODO improve this through use of an event - see Redmine #3093
-                 //      push the event into the dependencyList
-                 nbv->stream_local_wait_for_nonlocal();
+                 dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
              }
              nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
                                                dependencyList, stepWork.useGpuPmeFReduction,
      }
  
      launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork,
 -                            simulationWork.useGpuNonbonded, useGpuPmeOnThisRank, step, wcycle);
 +                            useGpuPmeOnThisRank, step, wcycle);
  
      if (DOMAINDECOMP(cr))
      {
index 3f4109db5401f7cb8e32ea599baf5e46c01be879,add995f26a0322b281321bf1608557f319e417a4..7f38b8d74cc74a1b9917dea8f23c2d17b13b9f33
  
  namespace gmx
  {
+ /*! \brief Number of CUDA threads in a block
+  *
+  * \todo Check if using a smaller block size will lead to better performance.
+  */
+ constexpr static int c_threadsPerBlock = 256;
+ //! Maximum number of threads in a block (for __launch_bounds__)
+ constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
+ /*! \brief Scaling matrix struct.
+  *
+  * \todo Should be generalized.
+  */
+ struct ScalingMatrix
+ {
+     float xx, yy, zz, yx, zx, zy;
+ };
+ __launch_bounds__(c_maxThreadsPerBlock) __global__
+         static void scaleCoordinates_kernel(const int numAtoms,
+                                             float3* __restrict__ gm_x,
+                                             const ScalingMatrix scalingMatrix)
+ {
+     int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+     if (threadIndex < numAtoms)
+     {
+         float3 x = gm_x[threadIndex];
+         x.x = scalingMatrix.xx * x.x + scalingMatrix.yx * x.y + scalingMatrix.zx * x.z;
+         x.y = scalingMatrix.yy * x.y + scalingMatrix.zy * x.z;
+         x.z = scalingMatrix.zz * x.z;
+         gm_x[threadIndex] = x;
+     }
+ }
  
  void UpdateConstrainCuda::Impl::integrate(GpuEventSynchronizer*             fReadyOnDevice,
                                            const real                        dt,
                                            const bool                        updateVelocities,
                                            const bool                        computeVirial,
                                            tensor                            virial,
-                                           const bool                        doTempCouple,
+                                           const bool                        doTemperatureScaling,
                                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                                           const bool                        doPressureCouple,
+                                           const bool                        doParrinelloRahman,
                                            const float                       dtPressureCouple,
-                                           const matrix                      velocityScalingMatrix)
+                                           const matrix                      prVelocityScalingMatrix)
  {
      // Clearing virial matrix
      // TODO There is no point in having separate virial matrix for constraints
  
      // The integrate should save a copy of the current coordinates in d_xp_ and write the updated ones
      // into d_x_. The d_xp_ is only needed by constraints.
-     integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTempCouple, tcstat, doPressureCouple,
-                            dtPressureCouple, velocityScalingMatrix);
+     integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat,
+                            doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
      // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
      // are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the
      // d_xp_ -> d_x_ copy after constraints. Note that the integrate saves them in the wrong order as well.
 -    lincsCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial);
 -    settleCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial);
 +    lincsCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
 +    settleCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
  
      // scaledVirial -> virial (methods above returns scaled values)
      float scaleFactor = 0.5f / (dt * dt);
      return;
  }
  
+ void UpdateConstrainCuda::Impl::scaleCoordinates(const matrix scalingMatrix)
+ {
+     ScalingMatrix mu;
+     mu.xx = scalingMatrix[XX][XX];
+     mu.yy = scalingMatrix[YY][YY];
+     mu.zz = scalingMatrix[ZZ][ZZ];
+     mu.yx = scalingMatrix[YY][XX];
+     mu.zx = scalingMatrix[ZZ][XX];
+     mu.zy = scalingMatrix[ZZ][YY];
+     const auto kernelArgs = prepareGpuKernelArguments(
+             scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_x_, &mu);
+     launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, nullptr,
+                     "scaleCoordinates_kernel", kernelArgs);
+     // TODO: Although this only happens on the pressure coupling steps, this synchronization
+     //       can affect the performance if nstpcouple is small.
+     gpuStreamSynchronize(commandStream_);
+ }
  UpdateConstrainCuda::Impl::Impl(const t_inputrec&     ir,
                                  const gmx_mtop_t&     mtop,
                                  const void*           commandStream,
      integrator_ = std::make_unique<LeapFrogCuda>(commandStream_);
      lincsCuda_  = std::make_unique<LincsCuda>(ir.nLincsIter, ir.nProjOrder, commandStream_);
      settleCuda_ = std::make_unique<SettleCuda>(mtop, commandStream_);
+     coordinateScalingKernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
+     coordinateScalingKernelLaunchConfig_.blockSize[1]     = 1;
+     coordinateScalingKernelLaunchConfig_.blockSize[2]     = 1;
+     coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
+     coordinateScalingKernelLaunchConfig_.stream           = commandStream_;
  }
  
  UpdateConstrainCuda::Impl::~Impl() {}
@@@ -155,11 -214,17 +214,14 @@@ void UpdateConstrainCuda::Impl::set(Dev
      integrator_->set(md, numTempScaleValues, md.cTC);
      lincsCuda_->set(idef, md);
      settleCuda_->set(idef, md);
+     coordinateScalingKernelLaunchConfig_.gridSize[0] =
+             (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
  }
  
  void UpdateConstrainCuda::Impl::setPbc(const t_pbc* pbc)
  {
      setPbcAiuc(pbc->ndim_ePBC, pbc->box, &pbcAiuc_);
 -    integrator_->setPbc(pbc);
 -    lincsCuda_->setPbc(pbc);
 -    settleCuda_->setPbc(pbc);
  }
  
  GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
@@@ -182,14 -247,19 +244,19 @@@ void UpdateConstrainCuda::integrate(Gpu
                                      const bool                        updateVelocities,
                                      const bool                        computeVirial,
                                      tensor                            virialScaled,
-                                     const bool                        doTempCouple,
+                                     const bool                        doTemperatureScaling,
                                      gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                                     const bool                        doPressureCouple,
+                                     const bool                        doParrinelloRahman,
                                      const float                       dtPressureCouple,
-                                     const matrix                      velocityScalingMatrix)
+                                     const matrix                      prVelocityScalingMatrix)
+ {
+     impl_->integrate(fReadyOnDevice, dt, updateVelocities, computeVirial, virialScaled, doTemperatureScaling,
+                      tcstat, doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
+ }
+ void UpdateConstrainCuda::scaleCoordinates(const matrix scalingMatrix)
  {
-     impl_->integrate(fReadyOnDevice, dt, updateVelocities, computeVirial, virialScaled, doTempCouple,
-                      tcstat, doPressureCouple, dtPressureCouple, velocityScalingMatrix);
+     impl_->scaleCoordinates(scalingMatrix);
  }
  
  void UpdateConstrainCuda::set(DeviceBuffer<float>       d_x,
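A host-side sketch of the coordinate transform applied by the new scaleCoordinates_kernel above and of the round-up grid sizing used for its launch configuration; Vec3, ScalingMatrixRef and the helper names are illustrative assumptions, not GROMACS API.

#include <vector>

struct Vec3
{
    float x, y, z;
};

// Same six non-zero elements as the ScalingMatrix struct introduced above.
struct ScalingMatrixRef
{
    float xx, yy, zz, yx, zx, zy;
};

// Apply the same triangular scaling as scaleCoordinates_kernel, one atom at a time.
static void scaleCoordinatesReference(std::vector<Vec3>& x, const ScalingMatrixRef& m)
{
    for (Vec3& r : x)
    {
        const Vec3 old = r;
        r.x = m.xx * old.x + m.yx * old.y + m.zx * old.z;
        r.y = m.yy * old.y + m.zy * old.z;
        r.z = m.zz * old.z;
    }
}

// Round-up block count, matching gridSize[0] = (numAtoms + c_threadsPerBlock - 1) / c_threadsPerBlock.
static int numBlocks(int numAtoms, int threadsPerBlock)
{
    return (numAtoms + threadsPerBlock - 1) / threadsPerBlock;
}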
index 5fb17e91995d3ec9951f9443102341be6128be1e,d68a5fec5414746cd2e373289dfcfc7db653f9f3..75a6f6d14cb18b3607a68c2f2cec1135ea7fbe6d
@@@ -196,12 -196,12 +196,12 @@@ struct DevelopmentFeatureFlag
   *
   * \param[in]  mdlog                Logger object.
   * \param[in]  useGpuForNonbonded   True if the nonbonded task is offloaded in this run.
-  * \param[in]  useGpuForPme         True if the PME task is offloaded in this run.
+  * \param[in]  pmeRunMode           The PME run mode for this run.
   * \returns                         The object populated with development feature flags.
   */
  static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& mdlog,
                                                           const bool           useGpuForNonbonded,
-                                                          const bool           useGpuForPme)
+                                                          const PmeRunMode     pmeRunMode)
  {
      DevelopmentFeatureFlags devFlags;
  
  
      if (devFlags.enableGpuPmePPComm)
      {
-         if (useGpuForPme)
+         if (pmeRunMode == PmeRunMode::GPU)
          {
              GMX_LOG(mdlog.warning)
                      .asParagraph()
          }
          else
          {
+             std::string clarification;
+             if (pmeRunMode == PmeRunMode::Mixed)
+             {
+                 clarification =
+                         "PME FFT and gather are not offloaded to the GPU (PME is running in mixed "
+                         "mode).";
+             }
+             else
+             {
+                 clarification = "PME is not offloaded to the GPU.";
+             }
              GMX_LOG(mdlog.warning)
                      .asParagraph()
-                     .appendTextFormatted(
+                     .appendText(
                              "NOTE: GMX_GPU_PME_PP_COMMS environment variable detected, but the "
-                             "'GPU PME-PP communications' feature was not enabled as PME is not "
-                             "offloaded to the GPU.");
+                             "'GPU PME-PP communications' feature was not enabled as "
+                             + clarification);
              devFlags.enableGpuPmePPComm = false;
          }
      }
@@@ -889,7 -900,7 +900,7 @@@ int Mdrunner::mdrunner(
      // Initialize development feature flags that are enabled by environment variables
      // and report those features that are enabled.
      const DevelopmentFeatureFlags devFlags =
-             manageDevelopmentFeatures(mdlog, useGpuForNonbonded, useGpuForPme);
+             manageDevelopmentFeatures(mdlog, useGpuForNonbonded, pmeRunMode);
  
      // NOTE: The devFlags need decideWhetherToUseGpusForNonbonded(...) and decideWhetherToUseGpusForPme(...) for overrides,
      //       decideWhetherToUseGpuForUpdate() needs devFlags for the '-update auto' override, hence the interleaving.
              /* This call is not included in init_domain_decomposition mainly
               * because fr->cginfo_mb is set later.
               */
 -            dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
 +            dd_init_bondeds(fplog, cr->dd, mtop, vsite.get(), inputrec,
                              domdecOptions.checkBondedInteractions, fr->cginfo_mb);
          }
  
      }
  
      // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
 -    // before we destroy the GPU context(s) in free_gpu_resources().
 +    // before we destroy the GPU context(s) in free_gpu().
      // Pinned buffers are associated with contexts in CUDA.
      // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
      mdAtoms.reset(nullptr);
      globalState.reset(nullptr);
      mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
 +    /* Free pinned buffers in *fr */
 +    delete fr;
 +    fr = nullptr;
 +
 +    if (hwinfo->gpu_info.n_dev > 0)
 +    {
 +        /* stop the GPU profiler (only CUDA) */
 +        stopGpuProfiler();
 +    }
 +
 +    /* With tMPI we need to wait for all ranks to finish deallocation before
 +     * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing
 +     * GPU and context.
 +     *
 +     * This is not a concern in OpenCL where we use one context per rank which
 +     * is freed in nbnxn_gpu_free().
 +     *
 +     * Note: it is safe to not call the barrier on the ranks which do not use GPU,
 +     * but it is easier and more futureproof to call it on the whole node.
 +     *
 +     * Note that this function needs to be called even if GPUs are not used
 +     * in this run because the PME ranks have no knowledge of whether GPUs
 +     * are used or not, but all ranks need to enter the barrier below.
 +     * \todo Remove this physical node barrier after making sure
 +     * that it's not needed anymore (with a shared GPU run).
 +     */
 +    if (GMX_THREAD_MPI)
 +    {
 +        physicalNodeComm.barrier();
 +    }
  
 -    /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
 -    free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
      free_gpu(nonbondedDeviceInfo);
      free_gpu(pmeDeviceInfo);
 -    done_forcerec(fr, mtop.molblock.size());
      sfree(fcd);
  
      if (doMembed)
index d68e3e77b933d15bfbe2000017a0892f1fbdcd54,f42ad7230e9b220eaca5be3cae2ebb401fa8390f..9d674afd8f5d05c8abdbadeab75010efe7c032ff
@@@ -194,7 -194,15 +194,15 @@@ void StatePropagatorDataGpu::Impl::rein
      }
  
      reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+     const int d_fOldCapacity = d_fCapacity_;
      reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+     // Clearing of the forces can be done in the local stream since the nonlocal stream cannot reach
+     // the force accumulation stage before syncing with the local stream. Only done in CUDA,
+     // since the force buffer ops are not implemented in OpenCL.
+     if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
+     {
+         clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
+     }
  }
  
  std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
@@@ -238,13 -246,8 +246,13 @@@ void StatePropagatorDataGpu::Impl::copy
  
      GMX_UNUSED_VALUE(dataSize);
  
 +    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 +
      GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
  
 +    GMX_ASSERT(commandStream != nullptr,
 +               "No stream is valid for copying with given atom locality.");
 +
      int atomsStartAt, numAtomsToCopy;
      std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
  
@@@ -272,13 -275,8 +280,13 @@@ void StatePropagatorDataGpu::Impl::copy
  
      GMX_UNUSED_VALUE(dataSize);
  
 +    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 +
      GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
  
 +    GMX_ASSERT(commandStream != nullptr,
 +               "No stream is valid for copying with given atom locality.");
 +
      int atomsStartAt, numAtomsToCopy;
      std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
  
@@@ -305,7 -303,12 +313,7 @@@ DeviceBuffer<float> StatePropagatorData
  void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
                                                          AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = xCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying positions with given atom locality.");
 -
 -    copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream);
 +    copyToDevice(d_x_, h_x, d_xSize_, atomLocality, xCopyStreams_[atomLocality]);
  
      // markEvent is skipped in OpenCL as:
      //   - it's not needed, copy is done in the same stream as the only consumer task (PME)
      // TODO: remove this by adding an event-mark free flavor of this function
      if (GMX_GPU == GMX_GPU_CUDA)
      {
 -        xReadyOnDevice_[atomLocality].markEvent(commandStream);
 +        xReadyOnDevice_[atomLocality].markEvent(xCopyStreams_[atomLocality]);
      }
  }
  
@@@ -357,9 -360,14 +365,9 @@@ GpuEventSynchronizer* StatePropagatorDa
  
  void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = xCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying positions with given atom locality.");
 -
 -    copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream);
 +    copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, xCopyStreams_[atomLocality]);
      // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
 -    xReadyOnHost_[atomLocality].markEvent(commandStream);
 +    xReadyOnHost_[atomLocality].markEvent(xCopyStreams_[atomLocality]);
  }
  
  void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
@@@ -376,8 -384,13 +384,8 @@@ DeviceBuffer<float> StatePropagatorData
  void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
                                                         AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = vCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying velocities with given atom locality.");
 -
 -    copyToDevice(d_v_, h_v, d_vSize_, atomLocality, commandStream);
 -    vReadyOnDevice_[atomLocality].markEvent(commandStream);
 +    copyToDevice(d_v_, h_v, d_vSize_, atomLocality, vCopyStreams_[atomLocality]);
 +    vReadyOnDevice_[atomLocality].markEvent(vCopyStreams_[atomLocality]);
  }
  
  GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
  
  void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v, AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = vCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying velocities with given atom locality.");
 -
 -    copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream);
 -    vReadyOnHost_[atomLocality].markEvent(commandStream);
 +    copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, vCopyStreams_[atomLocality]);
 +    vReadyOnHost_[atomLocality].markEvent(vCopyStreams_[atomLocality]);
  }
  
  void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
@@@ -406,8 -424,13 +414,8 @@@ DeviceBuffer<float> StatePropagatorData
  void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
                                                     AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = fCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying forces with given atom locality.");
 -
 -    copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
 -    fReadyOnDevice_[atomLocality].markEvent(commandStream);
 +    copyToDevice(d_f_, h_f, d_fSize_, atomLocality, fCopyStreams_[atomLocality]);
 +    fReadyOnDevice_[atomLocality].markEvent(fCopyStreams_[atomLocality]);
  }
  
  GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
@@@ -430,8 -453,13 +438,8 @@@ GpuEventSynchronizer* StatePropagatorDa
  
  void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
  {
 -    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
 -    CommandStream commandStream = fCopyStreams_[atomLocality];
 -    GMX_ASSERT(commandStream != nullptr,
 -               "No stream is valid for copying forces with given atom locality.");
 -
 -    copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
 -    fReadyOnHost_[atomLocality].markEvent(commandStream);
 +    copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, fCopyStreams_[atomLocality]);
 +    fReadyOnHost_[atomLocality].markEvent(fCopyStreams_[atomLocality]);
  }
  
  void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)
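A minimal sketch of the reallocate-then-clear pattern added to StatePropagatorDataGpu::Impl::reinit() above, where the force buffer is cleared only when its capacity actually changed; the std::vector buffer, FakeStream and clearAsync() are stand-ins for the device buffer, CUDA stream and clearDeviceBufferAsync(), not the real GROMACS wrappers.

#include <cstddef>
#include <cstring>
#include <vector>

struct FakeStream
{
};

// Stand-in for clearDeviceBufferAsync(): zero the whole buffer "in" the given stream.
static void clearAsync(std::vector<float>& buffer, FakeStream& /*stream*/)
{
    std::memset(buffer.data(), 0, buffer.size() * sizeof(float));
}

// Grow the force buffer to hold DIM * numAtomsAll floats and clear it only when
// the capacity changed, mirroring the GMX_GPU_CUDA branch in reinit().
static void reinitForces(std::vector<float>& d_f, int& capacity, int numAtomsAll, FakeStream& localStream)
{
    const int oldCapacity = capacity;
    const int requested   = 3 * numAtomsAll; // DIM * numAtomsAll
    if (requested > capacity)
    {
        capacity = requested;
        d_f.resize(static_cast<std::size_t>(capacity));
    }
    if (capacity != oldCapacity)
    {
        clearAsync(d_f, localStream);
    }
}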