int flags,
gmx::ArrayRef<const real> lambda,
t_nrnb* gmx_restrict nrnb,
- gmx::RVec* threadForceBuffer,
- rvec* threadForceShiftBuffer,
- gmx::ArrayRef<real> threadVc,
- gmx::ArrayRef<real> threadVv,
- gmx::ArrayRef<real> threadDvdl)
+ gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
+ rvec* threadForceShiftBuffer,
+ gmx::ArrayRef<real> threadVc,
+ gmx::ArrayRef<real> threadVv,
+ gmx::ArrayRef<real> threadDvdl)
{
#define STATE_A 0
#define STATE_B 1
dlFacVdw[i] = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFV[i]) : 1);
}
- // TODO: We should get rid of using pointers to real
- const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0];
+ // We need pointers to real for SIMD access
+ const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0];
+ real* gmx_restrict forceRealPtr = threadForceBuffer.paddedArrayRef().data()[0];
const real rlistSquared = gmx::square(rlist);
fIY = fIY + tY;
fIZ = fIZ + tZ;
- gmx::transposeScatterDecrU<3>(
- reinterpret_cast<real*>(threadForceBuffer), preloadJnr, tX, tY, tZ);
+ gmx::transposeScatterDecrU<3>(forceRealPtr, preloadJnr, tX, tY, tZ);
}
} // end for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
{
if (doForces)
{
- gmx::transposeScatterIncrU<3>(
- reinterpret_cast<real*>(threadForceBuffer), preloadIi, fIX, fIY, fIZ);
+ gmx::transposeScatterIncrU<3>(forceRealPtr, preloadIi, fIX, fIY, fIZ);
}
if (doShiftForces)
{
int flags,
gmx::ArrayRef<const real> lambda,
t_nrnb* gmx_restrict nrnb,
- gmx::RVec* threadForceBuffer,
+ gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
rvec* threadForceShiftBuffer,
gmx::ArrayRef<real> threadVc,
gmx::ArrayRef<real> threadVv,
int flags,
gmx::ArrayRef<const real> lambda,
t_nrnb* nrnb,
- gmx::RVec* threadForceBuffer,
+ gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
rvec* threadForceShiftBuffer,
gmx::ArrayRef<real> threadVc,
gmx::ArrayRef<real> threadVv,
"Unsupported eeltype with free energy");
GMX_ASSERT(ic.softCoreParameters, "We need soft-core parameters");
+ // Not all SIMD implementations need padding, but we provide padding anyhow so we can assert here that it is present
+ GMX_ASSERT(!GMX_SIMD_HAVE_REAL || threadForceBuffer.empty()
+ || threadForceBuffer.size() > threadForceBuffer.unpaddedArrayRef().ssize(),
+ "We need actual padding with at least one element for SIMD scatter operations");
+
const auto& scParams = *ic.softCoreParameters;
const bool vdwInteractionTypeIsEwald = (EVDW_PME(ic.vdwtype));
const bool elecInteractionTypeIsEwald = (EEL_PME_EWALD(ic.eeltype));
int flags,
gmx::ArrayRef<const real> lambda,
t_nrnb* gmx_restrict nrnb,
- gmx::RVec* threadForceBuffer,
+ gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
rvec* threadForceShiftBuffer,
gmx::ArrayRef<real> threadVc,
gmx::ArrayRef<real> threadVv,
doNBFlags,
lambdas,
&nrnb,
- output.f.arrayRefWithPadding().paddedArrayRef().data(),
+ output.f.arrayRefWithPadding(),
as_rvec_array(output.fShift.data()),
output.energy.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR],
output.energy.energyGroupPairTerms[NonBondedEnergyTerms::LJSR],
threadBuffer.clearForcesAndEnergies();
- rvec4* ft = threadBuffer.forceBuffer();
+ rvec4* ft = threadBuffer.forceBuffer().data();
/* Thread 0 writes directly to the main output buffers.
* We might want to reconsider this.
const int numBlocks = (numAtoms + s_reductionBlockSize - 1) >> s_numReductionBlockBits;
reductionMask_.resize(numBlocks);
- forceBuffer_.resize(numBlocks * s_reductionBlockSize * sizeof(ForceBufferElementType) / sizeof(real));
+
+ constexpr size_t c_numComponentsInElement = sizeof(ForceBufferElementType) / sizeof(real);
+ int newNumElements = numBlocks * s_reductionBlockSize;
+ if (c_numComponentsInElement != 4 && newNumElements == numAtoms)
+ {
+ // Pad with one element to allow 4-wide SIMD loads and stores.
+ // Note that only one extra real is actually needed, but the ArrayRef works in whole elements.
+ newNumElements += 1;
+ }
+ forceBuffer_.resize(newNumElements * c_numComponentsInElement);
for (gmx_bitmask_t& mask : reductionMask_)
{
{
if (bitmask_is_set(masks[blockIndex], ft))
{
- fp[numContributingBuffers++] = threadForceBuffers[ft]->forceBuffer();
+ fp[numContributingBuffers++] =
+ threadForceBuffers[ft]->forceBufferWithPadding().paddedArrayRef().data();
}
}
if (numContributingBuffers > 0)
#include <memory>
+#include "gromacs/math/arrayrefwithpadding.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/simulation_workload.h"
//! Clears all force and energy buffers
void clearForcesAndEnergies();
- //! Returns a plain pointer to the force buffer
- ForceBufferElementType* forceBuffer()
+ //! Returns an array reference to the force buffer which is aligned for SIMD access
+ ArrayRef<ForceBufferElementType> forceBuffer()
{
- return reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data());
+ return ArrayRef<ForceBufferElementType>(
+ reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()),
+ reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()) + numAtoms_);
+ }
+
+ /*! \brief Returns an array reference with padding to the force buffer, which is aligned for SIMD access
+ *
+ * For RVec there is padding of one real for 4-wide SIMD access.
+ * For both RVec and rvec4 there is padding up to the block size for use in ThreadedForceBuffer.
+ */
+ ArrayRefWithPadding<ForceBufferElementType> forceBufferWithPadding()
+ {
+ return ArrayRefWithPadding<ForceBufferElementType>(
+ reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()),
+ reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()) + numAtoms_,
+ reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data() + forceBuffer_.size()));
}
//! Returns a view of the shift force buffer
ArrayRef<const gmx_bitmask_t> reductionMask() const { return reductionMask_; }
private:
- //! Force array buffer
+ //! Force array buffer, aligned to enable aligned SIMD access
std::vector<real, AlignedAllocator<real>> forceBuffer_;
- //! Mask for marking which parts of f are filled, working array for constructing mask in bonded_threading_t
+ //! Mask for marking which parts of f are filled, working array for constructing mask in setupReduction()
std::vector<gmx_bitmask_t> reductionMask_;
//! Index to touched blocks
std::vector<int> usedBlockIndices_;
threadForceBuffer.clearForcesAndEnergies();
}
- gmx::RVec* threadForces = threadForceBuffer.forceBuffer();
+ auto threadForces = threadForceBuffer.forceBufferWithPadding();
rvec* threadForceShiftBuffer = as_rvec_array(threadForceBuffer.shiftForces().data());
gmx::ArrayRef<real> threadVc =
threadForceBuffer.groupPairEnergies().energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR];
kernelFlags,
lam_i,
nrnb,
- nullptr,
+ gmx::ArrayRefWithPadding<gmx::RVec>(),
nullptr,
threadVc,
threadVv,