From d25987124d088238be6ade218e742b59b0625755 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Fri, 28 Aug 2020 22:34:40 +0000 Subject: [PATCH] Add SIMD version of harmonic bonds Change-Id: I63ea7d9b76127112c221a9b6910604a09bbb0bb7 --- docs/release-notes/2021/major/performance.rst | 7 + src/gromacs/listed_forces/bonded.cpp | 124 ++++++++++++++++-- 2 files changed, 118 insertions(+), 13 deletions(-) diff --git a/docs/release-notes/2021/major/performance.rst b/docs/release-notes/2021/major/performance.rst index d486d0c308..b94c016d48 100644 --- a/docs/release-notes/2021/major/performance.rst +++ b/docs/release-notes/2021/major/performance.rst @@ -25,3 +25,10 @@ Support for offloading PME to GPU when doing Coulomb FEP """""""""""""""""""""""""""""""""""""""""""""""""""""""" PME calculations can be offloaded to GPU when doing Coulomb free-energy perturbations. + +CPU SIMD accelerated implementation of harmonic bonds +""""""""""""""""""""""""""""""""""""""""""""""""""""" + +SIMD acceleration for bonds slightly improves performance for systems +with H-bonds only constrained or no constraints. This gives a significant +improvement with multiple time stepping. diff --git a/src/gromacs/listed_forces/bonded.cpp b/src/gromacs/listed_forces/bonded.cpp index 113c5efd65..93a8bcd2cd 100644 --- a/src/gromacs/listed_forces/bonded.cpp +++ b/src/gromacs/listed_forces/bonded.cpp @@ -445,20 +445,20 @@ real harmonic(real kA, real kB, real xA, real xB, real x, real lambda, real* V, /* That was 19 flops */ } - template -real bonds(int nbonds, - const t_iatom forceatoms[], - const t_iparams forceparams[], - const rvec x[], - rvec4 f[], - rvec fshift[], - const t_pbc* pbc, - real lambda, - real* dvdlambda, - const t_mdatoms gmx_unused* md, - t_fcdata gmx_unused* fcd, - int gmx_unused* global_atom_index) +std::enable_if_t +bonds(int nbonds, + const t_iatom forceatoms[], + const t_iparams forceparams[], + const rvec x[], + rvec4 f[], + rvec fshift[], + const t_pbc* pbc, + real lambda, + real* dvdlambda, + const t_mdatoms gmx_unused* md, + t_fcdata gmx_unused* fcd, + int gmx_unused* global_atom_index) { int i, ki, ai, aj, type; real dr, dr2, fbond, vbond, vtot; @@ -493,6 +493,104 @@ real bonds(int nbonds, return vtot; } +#if GMX_SIMD_HAVE_REAL + +/*! \brief Computes forces (only) for harmonic bonds using SIMD intrinsics + * + * As plain-C bonds(), but using SIMD to calculate many bonds at once. + * This routines does not calculate energies and shift forces. + */ +template +std::enable_if_t +bonds(int nbonds, + const t_iatom forceatoms[], + const t_iparams forceparams[], + const rvec x[], + rvec4 f[], + rvec gmx_unused fshift[], + const t_pbc* pbc, + real gmx_unused lambda, + real gmx_unused* dvdlambda, + const t_mdatoms gmx_unused* md, + t_fcdata gmx_unused* fcd, + int gmx_unused* global_atom_index) +{ + constexpr int nfa1 = 3; + alignas(GMX_SIMD_ALIGNMENT) std::int32_t ai[GMX_SIMD_REAL_WIDTH]; + alignas(GMX_SIMD_ALIGNMENT) std::int32_t aj[GMX_SIMD_REAL_WIDTH]; + alignas(GMX_SIMD_ALIGNMENT) real coeff[2 * GMX_SIMD_REAL_WIDTH]; + + alignas(GMX_SIMD_ALIGNMENT) real pbc_simd[9 * GMX_SIMD_REAL_WIDTH]; + + set_pbc_simd(pbc, pbc_simd); + + const SimdReal real_eps = SimdReal(GMX_REAL_EPS); + + /* nbonds is the number of bonds times nfa1, here we step GMX_SIMD_REAL_WIDTH angles */ + for (int i = 0; i < nbonds; i += GMX_SIMD_REAL_WIDTH * nfa1) + { + /* Collect atoms for GMX_SIMD_REAL_WIDTH angles. + * iu indexes into forceatoms, we should not let iu go beyond nbonds. + */ + int iu = i; + for (int s = 0; s < GMX_SIMD_REAL_WIDTH; s++) + { + const int type = forceatoms[iu]; + ai[s] = forceatoms[iu + 1]; + aj[s] = forceatoms[iu + 2]; + + /* At the end fill the arrays with the last atoms and 0 params */ + if (i + s * nfa1 < nbonds) + { + coeff[s] = forceparams[type].harmonic.krA; + coeff[GMX_SIMD_REAL_WIDTH + s] = forceparams[type].harmonic.rA; + + if (iu + nfa1 < nbonds) + { + iu += nfa1; + } + } + else + { + coeff[s] = 0; + coeff[GMX_SIMD_REAL_WIDTH + s] = 0; + } + } + + /* Store the non PBC corrected distances packed and aligned */ + SimdReal xi, yi, zi; + SimdReal xj, yj, zj; + gatherLoadUTranspose<3>(reinterpret_cast(x), ai, &xi, &yi, &zi); + gatherLoadUTranspose<3>(reinterpret_cast(x), aj, &xj, &yj, &zj); + SimdReal rij_x = xi - xj; + SimdReal rij_y = yi - yj; + SimdReal rij_z = zi - zj; + + pbc_correct_dx_simd(&rij_x, &rij_y, &rij_z, pbc_simd); + + const SimdReal dist2 = rij_x * rij_x + rij_y * rij_y + rij_z * rij_z; + // Here we avoid sqrt(0), the force will be zero because rij=0 + const SimdReal invDist = invsqrt(max(dist2, real_eps)); + + const SimdReal k = load(coeff); + const SimdReal r0 = load(coeff + GMX_SIMD_REAL_WIDTH); + + // Compute the force divided by the distance + const SimdReal forceOverR = -k * (dist2 * invDist - r0) * invDist; + + const SimdReal f_x = forceOverR * rij_x; + const SimdReal f_y = forceOverR * rij_y; + const SimdReal f_z = forceOverR * rij_z; + + transposeScatterIncrU<4>(reinterpret_cast(f), ai, f_x, f_y, f_z); + transposeScatterDecrU<4>(reinterpret_cast(f), aj, f_x, f_y, f_z); + } + + return 0; +} + +#endif // GMX_SIMD_HAVE_REAL + template real restraint_bonds(int nbonds, const t_iatom forceatoms[], -- 2.22.0