From b7d298f95f97d387d8ba9567f2247bf394bcbd70 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Thu, 30 Sep 2021 13:40:53 +0000 Subject: [PATCH] Pad RVec force buffer in ThreadForceBuffer --- .../gmxlib/nonbonded/nb_free_energy.cpp | 30 +++++++++++-------- src/gromacs/gmxlib/nonbonded/nb_free_energy.h | 2 +- .../gmxlib/nonbonded/tests/nb_free_energy.cpp | 2 +- src/gromacs/listed_forces/listed_forces.cpp | 2 +- src/gromacs/mdtypes/threaded_force_buffer.cpp | 14 +++++++-- src/gromacs/mdtypes/threaded_force_buffer.h | 26 ++++++++++++---- src/gromacs/nbnxm/freeenergydispatch.cpp | 4 +-- 7 files changed, 55 insertions(+), 25 deletions(-) diff --git a/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp b/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp index ad316bd5da..dbb17010c2 100644 --- a/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp +++ b/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp @@ -248,11 +248,11 @@ static void nb_free_energy_kernel(const t_nblist& int flags, gmx::ArrayRef lambda, t_nrnb* gmx_restrict nrnb, - gmx::RVec* threadForceBuffer, - rvec* threadForceShiftBuffer, - gmx::ArrayRef threadVc, - gmx::ArrayRef threadVv, - gmx::ArrayRef threadDvdl) + gmx::ArrayRefWithPadding threadForceBuffer, + rvec* threadForceShiftBuffer, + gmx::ArrayRef threadVc, + gmx::ArrayRef threadVv, + gmx::ArrayRef threadDvdl) { #define STATE_A 0 #define STATE_B 1 @@ -391,8 +391,9 @@ static void nb_free_energy_kernel(const t_nblist& dlFacVdw[i] = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFV[i]) : 1); } - // TODO: We should get rid of using pointers to real - const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0]; + // We need pointers to real for SIMD access + const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0]; + real* gmx_restrict forceRealPtr = threadForceBuffer.paddedArrayRef().data()[0]; const real rlistSquared = gmx::square(rlist); @@ -914,8 +915,7 @@ static void nb_free_energy_kernel(const t_nblist& fIY = fIY + tY; fIZ = fIZ + tZ; - gmx::transposeScatterDecrU<3>( - reinterpret_cast(threadForceBuffer), preloadJnr, tX, tY, tZ); + gmx::transposeScatterDecrU<3>(forceRealPtr, preloadJnr, tX, tY, tZ); } } // end for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth) @@ -923,8 +923,7 @@ static void nb_free_energy_kernel(const t_nblist& { if (doForces) { - gmx::transposeScatterIncrU<3>( - reinterpret_cast(threadForceBuffer), preloadIi, fIX, fIY, fIZ); + gmx::transposeScatterIncrU<3>(forceRealPtr, preloadIi, fIX, fIY, fIZ); } if (doShiftForces) { @@ -985,7 +984,7 @@ typedef void (*KernelFunction)(const t_nblist& int flags, gmx::ArrayRef lambda, t_nrnb* gmx_restrict nrnb, - gmx::RVec* threadForceBuffer, + gmx::ArrayRefWithPadding threadForceBuffer, rvec* threadForceShiftBuffer, gmx::ArrayRef threadVc, gmx::ArrayRef threadVv, @@ -1119,7 +1118,7 @@ void gmx_nb_free_energy_kernel(const t_nblist& int flags, gmx::ArrayRef lambda, t_nrnb* nrnb, - gmx::RVec* threadForceBuffer, + gmx::ArrayRefWithPadding threadForceBuffer, rvec* threadForceShiftBuffer, gmx::ArrayRef threadVc, gmx::ArrayRef threadVv, @@ -1129,6 +1128,11 @@ void gmx_nb_free_energy_kernel(const t_nblist& "Unsupported eeltype with free energy"); GMX_ASSERT(ic.softCoreParameters, "We need soft-core parameters"); + // Not all SIMD implementations need padding, but we provide padding anyhow so we can assert + GMX_ASSERT(!GMX_SIMD_HAVE_REAL || threadForceBuffer.empty() + || threadForceBuffer.size() > threadForceBuffer.unpaddedArrayRef().ssize(), + "We need actual padding with at least one element for SIMD scatter operations"); + const auto& scParams = *ic.softCoreParameters; const bool vdwInteractionTypeIsEwald = (EVDW_PME(ic.vdwtype)); const bool elecInteractionTypeIsEwald = (EEL_PME_EWALD(ic.eeltype)); diff --git a/src/gromacs/gmxlib/nonbonded/nb_free_energy.h b/src/gromacs/gmxlib/nonbonded/nb_free_energy.h index e942afbf1b..fb97c67bfa 100644 --- a/src/gromacs/gmxlib/nonbonded/nb_free_energy.h +++ b/src/gromacs/gmxlib/nonbonded/nb_free_energy.h @@ -69,7 +69,7 @@ void gmx_nb_free_energy_kernel(const t_nblist& int flags, gmx::ArrayRef lambda, t_nrnb* gmx_restrict nrnb, - gmx::RVec* threadForceBuffer, + gmx::ArrayRefWithPadding threadForceBuffer, rvec* threadForceShiftBuffer, gmx::ArrayRef threadVc, gmx::ArrayRef threadVv, diff --git a/src/gromacs/gmxlib/nonbonded/tests/nb_free_energy.cpp b/src/gromacs/gmxlib/nonbonded/tests/nb_free_energy.cpp index d25b98c1dd..a4414ee716 100644 --- a/src/gromacs/gmxlib/nonbonded/tests/nb_free_energy.cpp +++ b/src/gromacs/gmxlib/nonbonded/tests/nb_free_energy.cpp @@ -469,7 +469,7 @@ protected: doNBFlags, lambdas, &nrnb, - output.f.arrayRefWithPadding().paddedArrayRef().data(), + output.f.arrayRefWithPadding(), as_rvec_array(output.fShift.data()), output.energy.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR], output.energy.energyGroupPairTerms[NonBondedEnergyTerms::LJSR], diff --git a/src/gromacs/listed_forces/listed_forces.cpp b/src/gromacs/listed_forces/listed_forces.cpp index 25900e838d..7a90343212 100644 --- a/src/gromacs/listed_forces/listed_forces.cpp +++ b/src/gromacs/listed_forces/listed_forces.cpp @@ -398,7 +398,7 @@ static void calcBondedForces(const InteractionDefinitions& idef, threadBuffer.clearForcesAndEnergies(); - rvec4* ft = threadBuffer.forceBuffer(); + rvec4* ft = threadBuffer.forceBuffer().data(); /* Thread 0 writes directly to the main output buffers. * We might want to reconsider this. diff --git a/src/gromacs/mdtypes/threaded_force_buffer.cpp b/src/gromacs/mdtypes/threaded_force_buffer.cpp index 0aca081f2c..515d1934b7 100644 --- a/src/gromacs/mdtypes/threaded_force_buffer.cpp +++ b/src/gromacs/mdtypes/threaded_force_buffer.cpp @@ -109,7 +109,16 @@ void ThreadForceBuffer::resizeBufferAndClearMask(const i const int numBlocks = (numAtoms + s_reductionBlockSize - 1) >> s_numReductionBlockBits; reductionMask_.resize(numBlocks); - forceBuffer_.resize(numBlocks * s_reductionBlockSize * sizeof(ForceBufferElementType) / sizeof(real)); + + constexpr size_t c_numComponentsInElement = sizeof(ForceBufferElementType) / sizeof(real); + int newNumElements = numBlocks * s_reductionBlockSize; + if (c_numComponentsInElement != 4 && newNumElements == numAtoms) + { + // Pad with one element to allow 4-wide SIMD loads and stores. + // Note that actually only one real is needed, but we need a whole element for the ArrayRef. + newNumElements += 1; + } + forceBuffer_.resize(newNumElements * c_numComponentsInElement); for (gmx_bitmask_t& mask : reductionMask_) { @@ -175,7 +184,8 @@ void reduceThreadForceBuffers(ArrayRef force, { if (bitmask_is_set(masks[blockIndex], ft)) { - fp[numContributingBuffers++] = threadForceBuffers[ft]->forceBuffer(); + fp[numContributingBuffers++] = + threadForceBuffers[ft]->forceBufferWithPadding().paddedArrayRef().data(); } } if (numContributingBuffers > 0) diff --git a/src/gromacs/mdtypes/threaded_force_buffer.h b/src/gromacs/mdtypes/threaded_force_buffer.h index 6eeae6008f..d6b948c88a 100644 --- a/src/gromacs/mdtypes/threaded_force_buffer.h +++ b/src/gromacs/mdtypes/threaded_force_buffer.h @@ -61,6 +61,7 @@ #include +#include "gromacs/math/arrayrefwithpadding.h" #include "gromacs/math/vectypes.h" #include "gromacs/mdtypes/enerdata.h" #include "gromacs/mdtypes/simulation_workload.h" @@ -118,10 +119,25 @@ public: //! Clears all force and energy buffers void clearForcesAndEnergies(); - //! Returns a plain pointer to the force buffer - ForceBufferElementType* forceBuffer() + //! Returns an array reference to the force buffer which is aligned for SIMD access + ArrayRef forceBuffer() { - return reinterpret_cast(forceBuffer_.data()); + return ArrayRef( + reinterpret_cast(forceBuffer_.data()), + reinterpret_cast(forceBuffer_.data()) + numAtoms_); + } + + /*! \brief Returns an array reference with padding to the force buffer which is aligned for SIMD access + * + * For RVec there is padding of one real for 4-wide SIMD access. + * For both RVec and rvec4 there is padding up to the block size for use in ThreadedForceBuffer. + */ + ArrayRefWithPadding forceBufferWithPadding() + { + return ArrayRefWithPadding( + reinterpret_cast(forceBuffer_.data()), + reinterpret_cast(forceBuffer_.data()) + numAtoms_, + reinterpret_cast(forceBuffer_.data() + forceBuffer_.size())); } //! Returns a view of the shift force buffer @@ -140,9 +156,9 @@ public: ArrayRef reductionMask() const { return reductionMask_; } private: - //! Force array buffer + //! Force array buffer, aligned to enable aligned SIMD access std::vector> forceBuffer_; - //! Mask for marking which parts of f are filled, working array for constructing mask in bonded_threading_t + //! Mask for marking which parts of f are filled, working array for constructing mask in setupReduction() std::vector reductionMask_; //! Index to touched blocks std::vector usedBlockIndices_; diff --git a/src/gromacs/nbnxm/freeenergydispatch.cpp b/src/gromacs/nbnxm/freeenergydispatch.cpp index 826bd9ff61..f5c0667bcf 100644 --- a/src/gromacs/nbnxm/freeenergydispatch.cpp +++ b/src/gromacs/nbnxm/freeenergydispatch.cpp @@ -191,7 +191,7 @@ void dispatchFreeEnergyKernel(gmx::ArrayRef> n threadForceBuffer.clearForcesAndEnergies(); } - gmx::RVec* threadForces = threadForceBuffer.forceBuffer(); + auto threadForces = threadForceBuffer.forceBufferWithPadding(); rvec* threadForceShiftBuffer = as_rvec_array(threadForceBuffer.shiftForces().data()); gmx::ArrayRef threadVc = threadForceBuffer.groupPairEnergies().energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR]; @@ -281,7 +281,7 @@ void dispatchFreeEnergyKernel(gmx::ArrayRef> n kernelFlags, lam_i, nrnb, - nullptr, + gmx::ArrayRefWithPadding(), nullptr, threadVc, threadVv, -- 2.22.0