Pad RVec force buffer in ThreadForceBuffer

[alexxy/gromacs.git] / src / gromacs / mdtypes / threaded_force_buffer.cpp
diff --git a/src/gromacs/mdtypes/threaded_force_buffer.cpp b/src/gromacs/mdtypes/threaded_force_buffer.cpp

index 0aca081f2c4945690b8101e60b07b452d21a49ae..515d1934b70ffc779b3eba7ff947092c0df6268f 100644 (file)
--- a/src/gromacs/mdtypes/threaded_force_buffer.cpp
+++ b/src/gromacs/mdtypes/threaded_force_buffer.cpp
@@ -109,7 +109,16 @@ void ThreadForceBuffer<ForceBufferElementType>::resizeBufferAndClearMask(const i
      const int numBlocks = (numAtoms + s_reductionBlockSize - 1) >> s_numReductionBlockBits;
  
      reductionMask_.resize(numBlocks);
-    forceBuffer_.resize(numBlocks * s_reductionBlockSize * sizeof(ForceBufferElementType) / sizeof(real));
+
+    constexpr size_t c_numComponentsInElement = sizeof(ForceBufferElementType) / sizeof(real);
+    int              newNumElements           = numBlocks * s_reductionBlockSize;
+    if (c_numComponentsInElement != 4 && newNumElements == numAtoms)
+    {
+        // Pad with one element to allow 4-wide SIMD loads and stores.
+        // Note that actually only one real is needed, but we need a whole element for the ArrayRef.
+        newNumElements += 1;
+    }
+    forceBuffer_.resize(newNumElements * c_numComponentsInElement);
  
      for (gmx_bitmask_t& mask : reductionMask_)
      {
@@ -175,7 +184,8 @@ void reduceThreadForceBuffers(ArrayRef<gmx::RVec> force,
              {
                  if (bitmask_is_set(masks[blockIndex], ft))
                  {
-                    fp[numContributingBuffers++] = threadForceBuffers[ft]->forceBuffer();
+                    fp[numContributingBuffers++] =
+                            threadForceBuffers[ft]->forceBufferWithPadding().paddedArrayRef().data();
                  }
              }
              if (numContributingBuffers > 0)