Pad RVec force buffer in ThreadForceBuffer
[alexxy/gromacs.git] / src / gromacs / mdtypes / threaded_force_buffer.cpp
index 0aca081f2c4945690b8101e60b07b452d21a49ae..515d1934b70ffc779b3eba7ff947092c0df6268f 100644 (file)
@@ -109,7 +109,16 @@ void ThreadForceBuffer<ForceBufferElementType>::resizeBufferAndClearMask(const i
     const int numBlocks = (numAtoms + s_reductionBlockSize - 1) >> s_numReductionBlockBits;
 
     reductionMask_.resize(numBlocks);
-    forceBuffer_.resize(numBlocks * s_reductionBlockSize * sizeof(ForceBufferElementType) / sizeof(real));
+
+    constexpr size_t c_numComponentsInElement = sizeof(ForceBufferElementType) / sizeof(real);
+    int              newNumElements           = numBlocks * s_reductionBlockSize;
+    if (c_numComponentsInElement != 4 && newNumElements == numAtoms)
+    {
+        // Pad with one element to allow 4-wide SIMD loads and stores.
+        // Note that actually only one real is needed, but we need a whole element for the ArrayRef.
+        newNumElements += 1;
+    }
+    forceBuffer_.resize(newNumElements * c_numComponentsInElement);
 
     for (gmx_bitmask_t& mask : reductionMask_)
     {
@@ -175,7 +184,8 @@ void reduceThreadForceBuffers(ArrayRef<gmx::RVec> force,
             {
                 if (bitmask_is_set(masks[blockIndex], ft))
                 {
-                    fp[numContributingBuffers++] = threadForceBuffers[ft]->forceBuffer();
+                    fp[numContributingBuffers++] =
+                            threadForceBuffers[ft]->forceBufferWithPadding().paddedArrayRef().data();
                 }
             }
             if (numContributingBuffers > 0)