*/
/* The macros in this file are intended to be used for writing
- * architecture independent SIMD intrinsics code.
+ * architecture-independent SIMD intrinsics code.
* To support a new architecture, adding macros here should be (nearly)
* all that is needed.
*/
* with different settings from the same source file.
*/
-/* NOTE: floor and blendv are NOT available with SSE2 only acceleration */
+/* NOTE: SSE2-only acceleration does not provide floor or blendv (both need SSE4.1) */
#undef GMX_SIMD_WIDTH_HERE
real *nbfp; /* Lennard-Jones 6*C6 and 12*C12 params, size ntype^2*2 */
int comb_rule; /* Combination rule, see enum above */
real *nbfp_comb; /* LJ parameter per atom type, size ntype*2 */
- real *nbfp_s4; /* As nbfp, but with stride 4, size ntype^2*4 */
+ real *nbfp_s4; /* As nbfp, but with stride 4, size ntype^2*4. The
+ * stride lets one 4-wide SIMD load fetch both values
+ * (e.g. a quad-float load in single precision on x86). */
int natoms; /* Number of atoms */
int natoms_local; /* Number of local atoms */
int *type; /* Atom types */
}
break;
case ljcrNONE:
- /* In nbfp_s4 we use a stride of 4 for storing two parameters */
+ /* nbfp_s4 stores two parameters using a stride of 4,
+ * because this suits the x86 SIMD single-precision
+ * quad-load intrinsics. Allocating and initializing
+ * nbfp_s4 even when it ends up unused is slightly
+ * wasteful, but avoiding that would need conditional
+ * code that is not worth the added complexity. */
nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4));
for (i = 0; i < nt; i++)
{