Simd4Real gri_S0, gri_S1, gri_S2, gri_S3;
/* With order 4 the z-spline is actually aligned */
- tz_S = load4(thz);
+ tz_S = load4(thz);
for (ithx = 0; (ithx < 4); ithx++)
{
- index_x = (i0+ithx)*pny*pnz;
- valx = coefficient*thx[ithx];
+ index_x = (i0 + ithx) * pny * pnz;
+ valx = coefficient * thx[ithx];
- vx_S = Simd4Real(valx);
+ vx_S = Simd4Real(valx);
vx_tz_S = vx_S * tz_S;
- gri_S0 = load4U(grid+index_x+(j0+0)*pnz+k0);
- gri_S1 = load4U(grid+index_x+(j0+1)*pnz+k0);
- gri_S2 = load4U(grid+index_x+(j0+2)*pnz+k0);
- gri_S3 = load4U(grid+index_x+(j0+3)*pnz+k0);
+ gri_S0 = load4U(grid + index_x + (j0 + 0) * pnz + k0);
+ gri_S1 = load4U(grid + index_x + (j0 + 1) * pnz + k0);
+ gri_S2 = load4U(grid + index_x + (j0 + 2) * pnz + k0);
+ gri_S3 = load4U(grid + index_x + (j0 + 3) * pnz + k0);
sum_S0 = fma(vx_tz_S, ty_S0, gri_S0);
sum_S1 = fma(vx_tz_S, ty_S1, gri_S1);
sum_S2 = fma(vx_tz_S, ty_S2, gri_S2);
sum_S3 = fma(vx_tz_S, ty_S3, gri_S3);
- store4U(grid+index_x+(j0+0)*pnz+k0, sum_S0);
- store4U(grid+index_x+(j0+1)*pnz+k0, sum_S1);
- store4U(grid+index_x+(j0+2)*pnz+k0, sum_S2);
- store4U(grid+index_x+(j0+3)*pnz+k0, sum_S3);
+ store4U(grid + index_x + (j0 + 0) * pnz + k0, sum_S0);
+ store4U(grid + index_x + (j0 + 1) * pnz + k0, sum_S1);
+ store4U(grid + index_x + (j0 + 2) * pnz + k0, sum_S2);
+ store4U(grid + index_x + (j0 + 3) * pnz + k0, sum_S3);
}
}
-#undef PME_SPREAD_SIMD4_ORDER4
+# undef PME_SPREAD_SIMD4_ORDER4
#endif
*/
/* Aligned SIMD4 spreading block for PME_ORDER 4 or 5.
 *
 * For every x index ithx in [0, PME_ORDER) this block updates PME_ORDER grid
 * rows (j0 .. j0+PME_ORDER-1), each row as two consecutive 4-wide vectors
 * starting at z index (k0 - offset), with
 *     fma(coefficient*thx[ithx] * tz, ty, grid_value)
 * and stores the result back to the same grid location.
 *
 * The names ithx, valx, thx, thy, thz, thz_aligned, grid, work, coefficient,
 * i0, j0, k0, pny and pnz are not declared here; they must be supplied by the
 * enclosing scope this block is expanded into.
 * NOTE(review): load4/store4 are presumably the aligned counterparts of
 * load4U/store4U, which would require grid and pnz to keep
 * (grid + index) 4-element aligned - confirm against the SIMD4 API.
 */
{
using namespace gmx;
- int offset;
- int index;
- Simd4Real ty_S0(thy[0]);
- Simd4Real ty_S1(thy[1]);
- Simd4Real ty_S2(thy[2]);
- Simd4Real ty_S3(thy[3]);
- Simd4Real tz_S0;
- Simd4Real tz_S1;
- Simd4Real vx_S;
- Simd4Real vx_tz_S0;
- Simd4Real vx_tz_S1;
- Simd4Real sum_S00, sum_S01, sum_S02, sum_S03;
- Simd4Real sum_S10, sum_S11, sum_S12, sum_S13;
- Simd4Real gri_S00, gri_S01, gri_S02, gri_S03;
- Simd4Real gri_S10, gri_S11, gri_S12, gri_S13;
-#if PME_ORDER == 5
- Simd4Real ty_S4(thy[4]);
- Simd4Real sum_S04;
- Simd4Real sum_S14;
- Simd4Real gri_S04;
- Simd4Real gri_S14;
-#endif
/* Naming: ty_S<r> holds thy[r] broadcast for row r; in sum_S<h><r> and
 * gri_S<h><r>, h selects the first (0) or second (1) 4-wide z vector and
 * r selects the y row. The *_S4 / *_S?4 variables exist only for order 5.
 */
+ int offset;
+ int index;
+ Simd4Real ty_S0(thy[0]);
+ Simd4Real ty_S1(thy[1]);
+ Simd4Real ty_S2(thy[2]);
+ Simd4Real ty_S3(thy[3]);
+ Simd4Real tz_S0;
+ Simd4Real tz_S1;
+ Simd4Real vx_S;
+ Simd4Real vx_tz_S0;
+ Simd4Real vx_tz_S1;
+ Simd4Real sum_S00, sum_S01, sum_S02, sum_S03;
+ Simd4Real sum_S10, sum_S11, sum_S12, sum_S13;
+ Simd4Real gri_S00, gri_S01, gri_S02, gri_S03;
+ Simd4Real gri_S10, gri_S11, gri_S12, gri_S13;
+# if PME_ORDER == 5
+ Simd4Real ty_S4(thy[4]);
+ Simd4Real sum_S04;
+ Simd4Real sum_S14;
+ Simd4Real gri_S04;
+ Simd4Real gri_S14;
+# endif
/* Low two bits of k0: distance of k0 from the previous multiple of 4. */
offset = k0 & 3;
/* Load the z-spline values shifted by 'offset', so that tz lane n lines up
 * with grid z index (k0 - offset + n). With PME_SIMD4_UNALIGNED the shift is
 * done by reading directly from (thz - offset); this touches elements
 * outside thz[0..PME_ORDER), which are discarded by the masks applied below.
 * NOTE(review): presumably the surrounding storage makes those reads safe -
 * verify at the point where thz is allocated.
 */
-#ifdef PME_SIMD4_UNALIGNED
- tz_S0 = load4U(thz-offset);
- tz_S1 = load4U(thz-offset+4);
-#else
+# ifdef PME_SIMD4_UNALIGNED
+ tz_S0 = load4U(thz - offset);
+ tz_S1 = load4U(thz - offset + 4);
+# else
{
int i;
/* Copy thz to an aligned buffer (unused buffer parts are masked) */
for (i = 0; i < PME_ORDER; i++)
{
- thz_aligned[offset+i] = thz[i];
+ thz_aligned[offset + i] = thz[i];
}
tz_S0 = load4(thz_aligned);
- tz_S1 = load4(thz_aligned+4);
+ tz_S1 = load4(thz_aligned + 4);
}
-#endif
+# endif
/* Apply the per-offset masks so that only the lanes holding the PME_ORDER
 * spline values contribute; the mask tables come from 'work' and are indexed
 * by offset.
 */
tz_S0 = selectByMask(tz_S0, work->mask_S0[offset]);
tz_S1 = selectByMask(tz_S1, work->mask_S1[offset]);
for (ithx = 0; (ithx < PME_ORDER); ithx++)
{
/* Linear grid index of row j0 in x-plane (i0 + ithx), with the z index
 * moved back by 'offset' to the preceding multiple of 4 relative to k0.
 */
- index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
- valx = coefficient*thx[ithx];
+ index = (i0 + ithx) * pny * pnz + j0 * pnz + k0 - offset;
+ valx = coefficient * thx[ithx];
/* Broadcast the x contribution and combine it with both z vectors. */
- vx_S = Simd4Real(valx);
+ vx_S = Simd4Real(valx);
vx_tz_S0 = vx_S * tz_S0;
vx_tz_S1 = vx_S * tz_S1;
/* Read the current grid values: first 4-wide vector of each row, then the
 * second one (z offset +4).
 */
- gri_S00 = load4(grid+index+0*pnz);
- gri_S01 = load4(grid+index+1*pnz);
- gri_S02 = load4(grid+index+2*pnz);
- gri_S03 = load4(grid+index+3*pnz);
-#if PME_ORDER == 5
- gri_S04 = load4(grid+index+4*pnz);
-#endif
- gri_S10 = load4(grid+index+0*pnz+4);
- gri_S11 = load4(grid+index+1*pnz+4);
- gri_S12 = load4(grid+index+2*pnz+4);
- gri_S13 = load4(grid+index+3*pnz+4);
-#if PME_ORDER == 5
- gri_S14 = load4(grid+index+4*pnz+4);
-#endif
+ gri_S00 = load4(grid + index + 0 * pnz);
+ gri_S01 = load4(grid + index + 1 * pnz);
+ gri_S02 = load4(grid + index + 2 * pnz);
+ gri_S03 = load4(grid + index + 3 * pnz);
+# if PME_ORDER == 5
+ gri_S04 = load4(grid + index + 4 * pnz);
+# endif
+ gri_S10 = load4(grid + index + 0 * pnz + 4);
+ gri_S11 = load4(grid + index + 1 * pnz + 4);
+ gri_S12 = load4(grid + index + 2 * pnz + 4);
+ gri_S13 = load4(grid + index + 3 * pnz + 4);
+# if PME_ORDER == 5
+ gri_S14 = load4(grid + index + 4 * pnz + 4);
+# endif
/* Accumulate: each row r gets fma(vx_tz, ty_S<r>, grid value).
 * NOTE(review): presumably fma(a, b, c) computes a*b + c, i.e. this adds
 * coefficient*thx*thy*thz to the grid - confirm against the SIMD API.
 */
sum_S00 = fma(vx_tz_S0, ty_S0, gri_S00);
sum_S01 = fma(vx_tz_S0, ty_S1, gri_S01);
sum_S02 = fma(vx_tz_S0, ty_S2, gri_S02);
sum_S03 = fma(vx_tz_S0, ty_S3, gri_S03);
-#if PME_ORDER == 5
+# if PME_ORDER == 5
sum_S04 = fma(vx_tz_S0, ty_S4, gri_S04);
-#endif
+# endif
sum_S10 = fma(vx_tz_S1, ty_S0, gri_S10);
sum_S11 = fma(vx_tz_S1, ty_S1, gri_S11);
sum_S12 = fma(vx_tz_S1, ty_S2, gri_S12);
sum_S13 = fma(vx_tz_S1, ty_S3, gri_S13);
-#if PME_ORDER == 5
+# if PME_ORDER == 5
sum_S14 = fma(vx_tz_S1, ty_S4, gri_S14);
-#endif
-
/* Write the updated values back to the locations they were read from. */
- store4(grid+index+0*pnz, sum_S00);
- store4(grid+index+1*pnz, sum_S01);
- store4(grid+index+2*pnz, sum_S02);
- store4(grid+index+3*pnz, sum_S03);
-#if PME_ORDER == 5
- store4(grid+index+4*pnz, sum_S04);
-#endif
- store4(grid+index+0*pnz+4, sum_S10);
- store4(grid+index+1*pnz+4, sum_S11);
- store4(grid+index+2*pnz+4, sum_S12);
- store4(grid+index+3*pnz+4, sum_S13);
-#if PME_ORDER == 5
- store4(grid+index+4*pnz+4, sum_S14);
-#endif
+# endif
+
+ store4(grid + index + 0 * pnz, sum_S00);
+ store4(grid + index + 1 * pnz, sum_S01);
+ store4(grid + index + 2 * pnz, sum_S02);
+ store4(grid + index + 3 * pnz, sum_S03);
+# if PME_ORDER == 5
+ store4(grid + index + 4 * pnz, sum_S04);
+# endif
+ store4(grid + index + 0 * pnz + 4, sum_S10);
+ store4(grid + index + 1 * pnz + 4, sum_S11);
+ store4(grid + index + 2 * pnz + 4, sum_S12);
+ store4(grid + index + 3 * pnz + 4, sum_S13);
+# if PME_ORDER == 5
+ store4(grid + index + 4 * pnz + 4, sum_S14);
+# endif
}
}
-#undef PME_ORDER
-#undef PME_SPREAD_SIMD4_ALIGNED
+# undef PME_ORDER
+# undef PME_SPREAD_SIMD4_ALIGNED
#endif