*/
struct do_fspline
{
- do_fspline (
- const gmx_pme_t * pme,
- const real * gmx_restrict grid,
- const PmeAtomComm * gmx_restrict atc,
- const splinedata_t * gmx_restrict spline,
- int nn)
- : pme(pme), grid(grid), atc(atc), spline(spline), nn(nn) {}
-
- template <typename Int>
+ do_fspline(const gmx_pme_t* pme,
+ const real* gmx_restrict grid,
+ const PmeAtomComm* gmx_restrict atc,
+ const splinedata_t* gmx_restrict spline,
+ int nn) :
+ pme(pme),
+ grid(grid),
+ atc(atc),
+ spline(spline),
+ nn(nn)
+ {
+ }
+
+ template<typename Int>
RVec operator()(Int order) const
{
static_assert(isIntegralConstant<Int, int>::value || std::is_same<Int, int>::value,
"'order' needs to be either of type integral_constant<int,N> or int.");
- const int norder = nn*order;
+ const int norder = nn * order;
/* Pointer arithmetic alert, next six statements */
- const real *const gmx_restrict thx = spline->theta.coefficients[XX] + norder;
- const real *const gmx_restrict thy = spline->theta.coefficients[YY] + norder;
- const real *const gmx_restrict thz = spline->theta.coefficients[ZZ] + norder;
- const real *const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
- const real *const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
- const real *const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
+ const real* const gmx_restrict thx = spline->theta.coefficients[XX] + norder;
+ const real* const gmx_restrict thy = spline->theta.coefficients[YY] + norder;
+ const real* const gmx_restrict thz = spline->theta.coefficients[ZZ] + norder;
+ const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
+ const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
+ const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
- RVec f(0, 0, 0);
+ RVec f(0, 0, 0);
for (int ithx = 0; (ithx < order); ithx++)
{
- const int index_x = (idxX + ithx)*gridNY*gridNZ;
+ const int index_x = (idxX + ithx) * gridNY * gridNZ;
const real tx = thx[ithx];
const real dx = dthx[ithx];
for (int ithy = 0; (ithy < order); ithy++)
{
- const int index_xy = index_x + (idxY + ithy)*gridNZ;
+ const int index_xy = index_x + (idxY + ithy) * gridNZ;
const real ty = thy[ithy];
const real dy = dthy[ithy];
- real fxy1 = 0, fz1 = 0;
+ real fxy1 = 0, fz1 = 0;
for (int ithz = 0; (ithz < order); ithz++)
{
const real gval = grid[index_xy + (idxZ + ithz)];
- fxy1 += thz[ithz]*gval;
- fz1 += dthz[ithz]*gval;
+ fxy1 += thz[ithz] * gval;
+ fz1 += dthz[ithz] * gval;
}
- f[XX] += dx*ty*fxy1;
- f[YY] += tx*dy*fxy1;
- f[ZZ] += tx*ty*fz1;
+ f[XX] += dx * ty * fxy1;
+ f[YY] += tx * dy * fxy1;
+ f[ZZ] += tx * ty * fz1;
}
}
return f;
}
-//TODO: Consider always have at least a dummy implementation of Simd (enough for first phase of two-phase lookup) and then use enable_if instead of #ifdef
+// TODO: Consider always have at least a dummy implementation of Simd (enough for first phase of two-phase lookup) and then use enable_if instead of #ifdef
#if PME_4NSIMD_GATHER
-/* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store.
- * Uses 4N SIMD where N is SIMD_WIDTH/4 to operate on all of z and N of y.
- * This code does not assume any memory alignment for the grid.
- */
- RVec
- operator()(std::integral_constant<int, 4> /*unused*/) const
+ /* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store.
+ * Uses 4N SIMD where N is SIMD_WIDTH/4 to operate on all of z and N of y.
+ * This code does not assume any memory alignment for the grid.
+ */
+ RVec operator()(std::integral_constant<int, 4> /*unused*/) const
{
- const int norder = nn*4;
+ const int norder = nn * 4;
/* Pointer arithmetic alert, next six statements */
- const real *const gmx_restrict thx = spline->theta.coefficients[XX] + norder;
- const real *const gmx_restrict thy = spline->theta.coefficients[YY] + norder;
- const real *const gmx_restrict thz = spline->theta.coefficients[ZZ] + norder;
- const real *const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
- const real *const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
- const real *const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
+ const real* const gmx_restrict thx = spline->theta.coefficients[XX] + norder;
+ const real* const gmx_restrict thy = spline->theta.coefficients[YY] + norder;
+ const real* const gmx_restrict thz = spline->theta.coefficients[ZZ] + norder;
+ const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
+ const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
+ const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
- Simd4NReal fx_S = setZero();
- Simd4NReal fy_S = setZero();
- Simd4NReal fz_S = setZero();
+ Simd4NReal fx_S = setZero();
+ Simd4NReal fy_S = setZero();
+ Simd4NReal fz_S = setZero();
/* With order 4 the z-spline is actually aligned */
const Simd4NReal tz_S = load4DuplicateN(thz);
for (int ithx = 0; ithx < 4; ithx++)
{
- const int index_x = (idxX + ithx)*gridNY*gridNZ;
+ const int index_x = (idxX + ithx) * gridNY * gridNZ;
const Simd4NReal tx_S = Simd4NReal(thx[ithx]);
const Simd4NReal dx_S = Simd4NReal(dthx[ithx]);
- for (int ithy = 0; ithy < 4; ithy += GMX_SIMD4N_REAL_WIDTH/4)
+ for (int ithy = 0; ithy < 4; ithy += GMX_SIMD4N_REAL_WIDTH / 4)
{
- const int index_xy = index_x + (idxY+ithy)*gridNZ;
+ const int index_xy = index_x + (idxY + ithy) * gridNZ;
- const Simd4NReal ty_S = loadUNDuplicate4(thy +ithy);
- const Simd4NReal dy_S = loadUNDuplicate4(dthy+ithy);
+ const Simd4NReal ty_S = loadUNDuplicate4(thy + ithy);
+ const Simd4NReal dy_S = loadUNDuplicate4(dthy + ithy);
- const Simd4NReal gval_S = loadU4NOffset(grid+index_xy+idxZ, gridNZ);
+ const Simd4NReal gval_S = loadU4NOffset(grid + index_xy + idxZ, gridNZ);
const Simd4NReal fxy1_S = tz_S * gval_S;
}
}
- return {
- reduce(fx_S), reduce(fy_S), reduce(fz_S)
- };
+ return { reduce(fx_S), reduce(fy_S), reduce(fz_S) };
}
#endif
#ifdef PME_SIMD4_SPREAD_GATHER
-/* Load order elements from unaligned memory into two 4-wide SIMD */
+ /* Load order elements from unaligned memory into two 4-wide SIMD */
template<int order>
- static inline void loadOrderU(const real* data, std::integral_constant<int, order> /*unused*/,
- int offset, Simd4Real* S0, Simd4Real* S1)
+ static inline void loadOrderU(const real* data,
+ std::integral_constant<int, order> /*unused*/,
+ int offset,
+ Simd4Real* S0,
+ Simd4Real* S1)
{
-#ifdef PME_SIMD4_UNALIGNED
- *S0 = load4U(data-offset);
- *S1 = load4U(data-offset+4);
-#else
- alignas(GMX_SIMD_ALIGNMENT) real buf_aligned[GMX_SIMD4_WIDTH*2];
+# ifdef PME_SIMD4_UNALIGNED
+ *S0 = load4U(data - offset);
+ *S1 = load4U(data - offset + 4);
+# else
+ alignas(GMX_SIMD_ALIGNMENT) real buf_aligned[GMX_SIMD4_WIDTH * 2];
/* Copy data to an aligned buffer */
for (int i = 0; i < order; i++)
{
- buf_aligned[offset+i] = data[i];
+ buf_aligned[offset + i] = data[i];
}
*S0 = load4(buf_aligned);
- *S1 = load4(buf_aligned+4);
-#endif
+ *S1 = load4(buf_aligned + 4);
+# endif
}
#endif
#ifdef PME_SIMD4_SPREAD_GATHER
-/* This code assumes that the grid is allocated 4-real aligned
- * and that pme->pmegrid_nz is a multiple of 4.
- * This code supports pme_order <= 5.
- */
- template <int Order>
- std::enable_if_t<Order == 4 || Order == 5, RVec>
- operator()(std::integral_constant<int, Order> order) const
+ /* This code assumes that the grid is allocated 4-real aligned
+ * and that pme->pmegrid_nz is a multiple of 4.
+ * This code supports pme_order <= 5.
+ */
+ template<int Order>
+ std::enable_if_t<Order == 4 || Order == 5, RVec> operator()(std::integral_constant<int, Order> order) const
{
- const int norder = nn*order;
- GMX_ASSERT(gridNZ % 4 == 0, "For aligned SIMD4 operations the grid size has to be padded up to a multiple of 4");
+ const int norder = nn * order;
+ GMX_ASSERT(gridNZ % 4 == 0,
+ "For aligned SIMD4 operations the grid size has to be padded up to a multiple "
+ "of 4");
/* Pointer arithmetic alert, next six statements */
- const real *const gmx_restrict thx = spline->theta.coefficients[XX] + norder;
- const real *const gmx_restrict thy = spline->theta.coefficients[YY] + norder;
- const real *const gmx_restrict thz = spline->theta.coefficients[ZZ] + norder;
- const real *const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
- const real *const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
- const real *const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
+ const real* const gmx_restrict thx = spline->theta.coefficients[XX] + norder;
+ const real* const gmx_restrict thy = spline->theta.coefficients[YY] + norder;
+ const real* const gmx_restrict thz = spline->theta.coefficients[ZZ] + norder;
+ const real* const gmx_restrict dthx = spline->dtheta.coefficients[XX] + norder;
+ const real* const gmx_restrict dthy = spline->dtheta.coefficients[YY] + norder;
+ const real* const gmx_restrict dthz = spline->dtheta.coefficients[ZZ] + norder;
- struct pme_spline_work *const work = pme->spline_work;
+ struct pme_spline_work* const work = pme->spline_work;
- const int offset = idxZ & 3;
+ const int offset = idxZ & 3;
- Simd4Real fx_S = setZero();
- Simd4Real fy_S = setZero();
- Simd4Real fz_S = setZero();
+ Simd4Real fx_S = setZero();
+ Simd4Real fy_S = setZero();
+ Simd4Real fz_S = setZero();
- Simd4Real tz_S0, tz_S1, dz_S0, dz_S1;
- loadOrderU(thz, order, offset, &tz_S0, &tz_S1);
+ Simd4Real tz_S0, tz_S1, dz_S0, dz_S1;
+ loadOrderU(thz, order, offset, &tz_S0, &tz_S1);
loadOrderU(dthz, order, offset, &dz_S0, &dz_S1);
tz_S0 = selectByMask(tz_S0, work->mask_S0[offset]);
for (int ithx = 0; (ithx < order); ithx++)
{
- const int index_x = (idxX + ithx)*gridNY*gridNZ;
- const Simd4Real tx_S = Simd4Real(thx[ithx]);
- const Simd4Real dx_S = Simd4Real(dthx[ithx]);
+ const int index_x = (idxX + ithx) * gridNY * gridNZ;
+ const Simd4Real tx_S = Simd4Real(thx[ithx]);
+ const Simd4Real dx_S = Simd4Real(dthx[ithx]);
for (int ithy = 0; (ithy < order); ithy++)
{
- const int index_xy = index_x + (idxY + ithy)*gridNZ;
+ const int index_xy = index_x + (idxY + ithy) * gridNZ;
const Simd4Real ty_S = Simd4Real(thy[ithy]);
const Simd4Real dy_S = Simd4Real(dthy[ithy]);
}
}
- return {
- reduce(fx_S), reduce(fy_S), reduce(fz_S)
- };
+ return { reduce(fx_S), reduce(fy_S), reduce(fz_S) };
}
#endif
- private:
- const gmx_pme_t *const pme;
- const real *const gmx_restrict grid;
- const PmeAtomComm *const gmx_restrict atc;
- const splinedata_t *const gmx_restrict spline;
- const int nn;
-
- const int gridNY = pme->pmegrid_ny;
- const int gridNZ = pme->pmegrid_nz;
-
- const int *const idxptr = atc->idx[spline->ind[nn]];
- const int idxX = idxptr[XX];
- const int idxY = idxptr[YY];
- const int idxZ = idxptr[ZZ];
+private:
+ const gmx_pme_t* const pme;
+ const real* const gmx_restrict grid;
+ const PmeAtomComm* const gmx_restrict atc;
+ const splinedata_t* const gmx_restrict spline;
+ const int nn;
+
+ const int gridNY = pme->pmegrid_ny;
+ const int gridNZ = pme->pmegrid_nz;
+
+ const int* const idxptr = atc->idx[spline->ind[nn]];
+ const int idxX = idxptr[XX];
+ const int idxY = idxptr[YY];
+ const int idxZ = idxptr[ZZ];
};
-void gather_f_bsplines(const gmx_pme_t *pme, const real *grid,
- gmx_bool bClearF, const PmeAtomComm *atc,
- const splinedata_t *spline,
- real scale)
+void gather_f_bsplines(const gmx_pme_t* pme,
+ const real* grid,
+ gmx_bool bClearF,
+ const PmeAtomComm* atc,
+ const splinedata_t* spline,
+ real scale)
{
/* sum forces for local particles */
- const int order = pme->pme_order;
- const int nx = pme->nkx;
- const int ny = pme->nky;
- const int nz = pme->nkz;
+ const int order = pme->pme_order;
+ const int nx = pme->nkx;
+ const int ny = pme->nky;
+ const int nz = pme->nkz;
- const real rxx = pme->recipbox[XX][XX];
- const real ryx = pme->recipbox[YY][XX];
- const real ryy = pme->recipbox[YY][YY];
- const real rzx = pme->recipbox[ZZ][XX];
- const real rzy = pme->recipbox[ZZ][YY];
- const real rzz = pme->recipbox[ZZ][ZZ];
+ const real rxx = pme->recipbox[XX][XX];
+ const real ryx = pme->recipbox[YY][XX];
+ const real ryy = pme->recipbox[YY][YY];
+ const real rzx = pme->recipbox[ZZ][XX];
+ const real rzy = pme->recipbox[ZZ][YY];
+ const real rzz = pme->recipbox[ZZ][ZZ];
/* Extract the buffer for force output */
- rvec * gmx_restrict force = as_rvec_array(atc->f.data());
+ rvec* gmx_restrict force = as_rvec_array(atc->f.data());
/* Note that unrolling this loop by templating this function on order
* deteriorates performance significantly with gcc5/6/7.
for (int nn = 0; nn < spline->n; nn++)
{
const int n = spline->ind[nn];
- const real coefficient = scale*atc->coefficient[n];
+ const real coefficient = scale * atc->coefficient[n];
if (bClearF)
{
switch (order)
{
- case 4:
- f = spline_func(std::integral_constant<int, 4>());
- break;
- case 5:
- f = spline_func(std::integral_constant<int, 5>());
- break;
- default:
- f = spline_func(order);
- break;
+ case 4: f = spline_func(std::integral_constant<int, 4>()); break;
+ case 5: f = spline_func(std::integral_constant<int, 5>()); break;
+ default: f = spline_func(order); break;
}
- force[n][XX] += -coefficient*( f[XX]*nx*rxx );
- force[n][YY] += -coefficient*( f[XX]*nx*ryx + f[YY]*ny*ryy );
- force[n][ZZ] += -coefficient*( f[XX]*nx*rzx + f[YY]*ny*rzy + f[ZZ]*nz*rzz );
+ force[n][XX] += -coefficient * (f[XX] * nx * rxx);
+ force[n][YY] += -coefficient * (f[XX] * nx * ryx + f[YY] * ny * ryy);
+ force[n][ZZ] += -coefficient * (f[XX] * nx * rzx + f[YY] * ny * rzy + f[ZZ] * nz * rzz);
}
}
/* Since the energy and not forces are interpolated
}
-real gather_energy_bsplines(gmx_pme_t *pme, const real *grid,
- PmeAtomComm *atc)
+real gather_energy_bsplines(gmx_pme_t* pme, const real* grid, PmeAtomComm* atc)
{
- splinedata_t *spline;
+ splinedata_t* spline;
int ithx, ithy, ithz, i0, j0, k0;
int index_x, index_xy;
- int *idxptr;
+ int* idxptr;
real energy, pot, tx, ty, coefficient, gval;
- real *thx, *thy, *thz;
+ real * thx, *thy, *thz;
int norder;
int order;
energy = 0;
for (int n = 0; n < atc->numAtoms(); n++)
{
- coefficient = atc->coefficient[n];
+ coefficient = atc->coefficient[n];
if (coefficient != 0)
{
idxptr = atc->idx[n];
- norder = n*order;
+ norder = n * order;
- i0 = idxptr[XX];
- j0 = idxptr[YY];
- k0 = idxptr[ZZ];
+ i0 = idxptr[XX];
+ j0 = idxptr[YY];
+ k0 = idxptr[ZZ];
/* Pointer arithmetic alert, next three statements */
- thx = spline->theta.coefficients[XX] + norder;
- thy = spline->theta.coefficients[YY] + norder;
- thz = spline->theta.coefficients[ZZ] + norder;
+ thx = spline->theta.coefficients[XX] + norder;
+ thy = spline->theta.coefficients[YY] + norder;
+ thz = spline->theta.coefficients[ZZ] + norder;
pot = 0;
for (ithx = 0; (ithx < order); ithx++)
{
- index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
+ index_x = (i0 + ithx) * pme->pmegrid_ny * pme->pmegrid_nz;
tx = thx[ithx];
for (ithy = 0; (ithy < order); ithy++)
{
- index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
+ index_xy = index_x + (j0 + ithy) * pme->pmegrid_nz;
ty = thy[ithy];
for (ithz = 0; (ithz < order); ithz++)
{
- gval = grid[index_xy+(k0+ithz)];
- pot += tx*ty*thz[ithz]*gval;
+ gval = grid[index_xy + (k0 + ithz)];
+ pot += tx * ty * thz[ithz] * gval;
}
-
}
}
- energy += pot*coefficient;
+ energy += pot * coefficient;
}
}