* CALCULATE INTERACTIONS *
**************************/
+ if (rsq00<rcutoff2)
+ {
+
r00 = rsq00*rinv00;
/* Calculate table index by multiplying r with table scale and truncate to integer */
f[j_coord_offset+DIM*0+YY] -= ty;
f[j_coord_offset+DIM*0+ZZ] -= tz;
+ }
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
* CALCULATE INTERACTIONS *
**************************/
+ if (rsq00<rcutoff2)
+ {
+
r00 = rsq00*rinv00;
/* Calculate table index by multiplying r with table scale and truncate to integer */
f[j_coord_offset+DIM*0+YY] -= ty;
f[j_coord_offset+DIM*0+ZZ] -= tz;
+ }
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
* CALCULATE INTERACTIONS *
**************************/
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
/* ## We always calculate rinv/rinvsq above to enable pipelineing in compilers (performance tested on x86) */
if (rsq{I}{J}<rcutoff2)
{
/* #define INNERFLOPS INNERFLOPS+9 */
/* #endif */
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
/* #if 0 ## This and next two lines is a hack to maintain indentation in template file */
{
/* #endif */
*/
-static void
-gmx_mm_load_1rvec_broadcast_ps(float *ptrA, __m128 *x, __m128 *y, __m128 *z)
+static gmx_inline void /* Adds the 3 floats at xyz_shift to the 3 floats at xyz and broadcasts each sum to all four lanes of *x1, *y1, *z1. */
+gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1,
+ __m128 * gmx_restrict y1,
+ __m128 * gmx_restrict z1)
{
- __m128 t1;
-
- t1 = _mm_loadu_ps(ptrA);
-
- *x = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
- *y = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
- *z = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+ __m128 t1,t2,t3,t4;
+
+ t1 = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift)); /* one 64-bit load: lanes 0,1 = shift x,y; upper lanes zero. NOTE(review): reads floats through a double pointer - confirm aliasing/alignment is acceptable for the supported compilers */
+ t2 = _mm_castpd_ps(_mm_load_sd((const double *)xyz)); /* lanes 0,1 = coordinate x,y */
+ t3 = _mm_load_ss(xyz_shift+2); /* lane 0 = shift z */
+ t4 = _mm_load_ss(xyz+2); /* lane 0 = coordinate z */
+ t1 = _mm_add_ps(t1,t2); /* lanes 0,1 = shifted x,y */
+ t3 = _mm_add_ss(t3,t4); /* lane 0 = shifted z */
+
+ *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0)); /* broadcast lane 0 of t1 (x) */
+ *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1)); /* broadcast lane 1 of t1 (y) */
+ *z1 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0)); /* broadcast lane 0 of t3 (z) */
}
-static void
-gmx_mm_load_3rvec_broadcast_ps(float *ptrA,
- __m128 *x1, __m128 *y1, __m128 *z1,
- __m128 *x2, __m128 *y2, __m128 *z2,
- __m128 *x3, __m128 *y3, __m128 *z3)
-{
- __m128 t1,t2,t3;
-
- t1 = _mm_loadu_ps(ptrA);
- t2 = _mm_loadu_ps(ptrA+4);
- *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
- *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
- *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
- *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
- *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
- *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
- *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
- *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
-
- t3 = _mm_load_ss(ptrA+8);
- *z3 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+static gmx_inline void /* Adds the 3-float shift at xyz_shift to each of the three consecutive 3-float vectors at xyz (9 floats) and broadcasts every resulting component to all four lanes of its output. */
+gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+{
+ __m128 tA,tB;
+ __m128 t1,t2,t3,t4,t5,t6;
+
+ tA = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift)); /* lanes 0,1 = shift x,y via one 64-bit load. NOTE(review): float data read through a double pointer - confirm this is acceptable for the supported compilers */
+ tB = _mm_load_ss(xyz_shift+2); /* lane 0 = shift z */
+
+ t1 = _mm_loadu_ps(xyz); /* floats 0..3: x1 y1 z1 x2 */
+ t2 = _mm_loadu_ps(xyz+4); /* floats 4..7: y2 z2 x3 y3 */
+ t3 = _mm_load_ss(xyz+8); /* float 8: z3 (only 9 floats are read in total) */
+
+ tA = _mm_movelh_ps(tA,tB); /* tA lanes 0..3 = sx sy sz 0 */
+ t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0)); /* sx sy sz sx, lines up with t1 */
+ t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1)); /* sy sz sx sy, lines up with t2 */
+ t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2)); /* sz sx sy sz, only lane 0 is used below */
+
+ t1 = _mm_add_ps(t1,t4);
+ t2 = _mm_add_ps(t2,t5);
+ t3 = _mm_add_ss(t3,t6); /* scalar add: lane 0 = z3 + sz */
+
+ *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0)); /* each store below broadcasts one lane of t1/t2/t3 */
+ *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
+ *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+ *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
+ *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
+ *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
+ *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
+ *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
+ *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
}
-static void
-gmx_mm_load_4rvec_broadcast_ps(float *ptrA,
- __m128 *x1, __m128 *y1, __m128 *z1,
- __m128 *x2, __m128 *y2, __m128 *z2,
- __m128 *x3, __m128 *y3, __m128 *z3,
- __m128 *x4, __m128 *y4, __m128 *z4)
-{
- __m128 t1,t2,t3;
- __m128 tA;
-
- t1 = _mm_loadu_ps(ptrA);
- t2 = _mm_loadu_ps(ptrA+4);
- t3 = _mm_loadu_ps(ptrA+8);
-
- *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
- *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
- *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
- *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
- *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
- *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
- *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
- *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
- *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
- *x4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
- *y4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(2,2,2,2));
- *z4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(3,3,3,3));
+
+static gmx_inline void /* Adds the 3-float shift at xyz_shift to each of the four consecutive 3-float vectors at xyz (12 floats) and broadcasts every resulting component to all four lanes of its output. */
+gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+{
+ __m128 tA,tB;
+ __m128 t1,t2,t3,t4,t5,t6;
+
+ tA = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift)); /* lanes 0,1 = shift x,y via one 64-bit load. NOTE(review): float data read through a double pointer - confirm this is acceptable for the supported compilers */
+ tB = _mm_load_ss(xyz_shift+2); /* lane 0 = shift z */
+
+ t1 = _mm_loadu_ps(xyz); /* floats 0..3: x1 y1 z1 x2 */
+ t2 = _mm_loadu_ps(xyz+4); /* floats 4..7: y2 z2 x3 y3 */
+ t3 = _mm_loadu_ps(xyz+8); /* floats 8..11: z3 x4 y4 z4 */
+
+ tA = _mm_movelh_ps(tA,tB); /* tA lanes 0..3 = sx sy sz 0 */
+ t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0)); /* sx sy sz sx, lines up with t1 */
+ t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1)); /* sy sz sx sy, lines up with t2 */
+ t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2)); /* sz sx sy sz, lines up with t3 */
+
+ t1 = _mm_add_ps(t1,t4);
+ t2 = _mm_add_ps(t2,t5);
+ t3 = _mm_add_ps(t3,t6);
+
+ *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0)); /* each store below broadcasts one lane of t1/t2/t3 */
+ *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
+ *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+ *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
+ *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
+ *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
+ *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
+ *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
+ *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
+ *x4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
+ *y4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(2,2,2,2));
+ *z4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(3,3,3,3));
}
static void
-gmx_mm_load_2rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
- __m128 *x1, __m128 *y1, __m128 *z1,
- __m128 *x2, __m128 *y2, __m128 *z2)
-{
- __m128 t1,t2,t3,t4;
- t1 = _mm_loadu_ps(ptrA);
- t2 = _mm_loadu_ps(ptrB);
- t3 = _mm_loadu_ps(ptrC);
- t4 = _mm_loadu_ps(ptrD);
- _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
- *x1 = t1;
- *y1 = t2;
- *z1 = t3;
- *x2 = t4;
- t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrA+4));
- t2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrB+4));
- t3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrC+4));
- t4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptrD+4));
- t1 = _mm_unpacklo_ps(t1,t3);
- t2 = _mm_unpacklo_ps(t2,t4);
- *y2 = _mm_unpacklo_ps(t1,t2);
- *z2 = _mm_unpackhi_ps(t1,t2);
-}
-
-
-static void
-gmx_mm_load_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
- __m128 *x1, __m128 *y1, __m128 *z1,
- __m128 *x2, __m128 *y2, __m128 *z2,
- __m128 *x3, __m128 *y3, __m128 *z3)
+gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
+ const float * gmx_restrict ptrB,
+ const float * gmx_restrict ptrC,
+ const float * gmx_restrict ptrD,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
__m128 t1,t2,t3,t4;
t1 = _mm_loadu_ps(ptrA);
static void
-gmx_mm_load_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
- __m128 *x1, __m128 *y1, __m128 *z1,
- __m128 *x2, __m128 *y2, __m128 *z2,
- __m128 *x3, __m128 *y3, __m128 *z3,
- __m128 *x4, __m128 *y4, __m128 *z4)
+gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
+ const float * gmx_restrict ptrB,
+ const float * gmx_restrict ptrC,
+ const float * gmx_restrict ptrD,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
__m128 t1,t2,t3,t4;
t1 = _mm_loadu_ps(ptrA);
}
-/* Routines to increment rvec in memory, typically use for j particle force updates */
-static void
-gmx_mm_increment_1rvec_1ptr_noswizzle_ps(float *ptrA, __m128 xyz)
-{
- __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
- __m128 t1;
-
- t1 = _mm_loadu_ps(ptrA);
- xyz = _mm_and_ps(mask,xyz);
- t1 = _mm_add_ps(t1,xyz);
- _mm_storeu_ps(ptrA,t1);
-}
-
-
-static void
-gmx_mm_increment_3rvec_1ptr_noswizzle_ps(float *ptrA,
- __m128 xyz1, __m128 xyz2, __m128 xyz3)
-{
- __m128 t1,t2,t3,t4;
- __m128 tA,tB,tC;
-
- tA = _mm_loadu_ps(ptrA);
- tB = _mm_loadu_ps(ptrA+4);
- tC = _mm_load_ss(ptrA+8);
-
- t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
- t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
-
- t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3)); /* z1 z1 x2 x2 */
- t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
-
- t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
-
- tA = _mm_add_ps(tA,t3);
- tB = _mm_add_ps(tB,t4);
- tC = _mm_add_ss(tC,t2);
-
- _mm_storeu_ps(ptrA,tA);
- _mm_storeu_ps(ptrA+4,tB);
- _mm_store_ss(ptrA+8,tC);
-}
-
-static void
-gmx_mm_increment_4rvec_1ptr_noswizzle_ps(float *ptrA,
- __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
-{
- __m128 t1,t2,t3,t4,t5;
- __m128 tA,tB,tC;
-
- tA = _mm_loadu_ps(ptrA);
- tB = _mm_loadu_ps(ptrA+4);
- tC = _mm_loadu_ps(ptrA+8);
-
- t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
- t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
-
- t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3)); /* z1 z1 x2 x2 */
- t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
-
- t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
- t5 = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 - */
-
- t2 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(1,1,0,0)); /* x4 x4 z3 z3 */
- t5 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(3,2,2,0)); /* z4 y4 x4 z3 */
-
- tA = _mm_add_ps(tA,t3);
- tB = _mm_add_ps(tB,t4);
- tC = _mm_add_ps(tC,t5);
-
- _mm_storeu_ps(ptrA,tA);
- _mm_storeu_ps(ptrA+4,tB);
- _mm_storeu_ps(ptrA+8,tC);
-
-}
-
-
-
-static void
-gmx_mm_increment_1rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC,float *ptrD,
- __m128 x1, __m128 y1, __m128 z1)
-{
- __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
- t5 = _mm_unpacklo_ps(y1,z1);
- t6 = _mm_unpackhi_ps(y1,z1);
- t7 = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
- t8 = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
- t9 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
- t10 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
- t1 = _mm_load_ss(ptrA);
- t1 = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
- t1 = _mm_add_ps(t1,t7);
- _mm_store_ss(ptrA,t1);
- _mm_storeh_pi((__m64 *)(ptrA+1),t1);
- t2 = _mm_load_ss(ptrB);
- t2 = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
- t2 = _mm_add_ps(t2,t8);
- _mm_store_ss(ptrB,t2);
- _mm_storeh_pi((__m64 *)(ptrB+1),t2);
- t3 = _mm_load_ss(ptrC);
- t3 = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
- t3 = _mm_add_ps(t3,t9);
- _mm_store_ss(ptrC,t3);
- _mm_storeh_pi((__m64 *)(ptrC+1),t3);
- t4 = _mm_load_ss(ptrD);
- t4 = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
- t4 = _mm_add_ps(t4,t10);
- _mm_store_ss(ptrD,t4);
- _mm_storeh_pi((__m64 *)(ptrD+1),t4);
-}
-
-
-
-
-static void
-gmx_mm_increment_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
- __m128 x1, __m128 y1, __m128 z1,
- __m128 x2, __m128 y2, __m128 z2,
- __m128 x3, __m128 y3, __m128 z3)
-{
- __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
- __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
- __m128 t20,t21,t22,t23,t24,t25;
-
- t13 = _mm_unpackhi_ps(x1,y1);
- x1 = _mm_unpacklo_ps(x1,y1);
- t14 = _mm_unpackhi_ps(z1,x2);
- z1 = _mm_unpacklo_ps(z1,x2);
- t15 = _mm_unpackhi_ps(y2,z2);
- y2 = _mm_unpacklo_ps(y2,z2);
- t16 = _mm_unpackhi_ps(x3,y3);
- x3 = _mm_unpacklo_ps(x3,y3);
- t17 = _mm_shuffle_ps(z3,z3,_MM_SHUFFLE(0,0,0,1));
- t18 = _mm_movehl_ps(z3,z3);
- t19 = _mm_shuffle_ps(t18,t18,_MM_SHUFFLE(0,0,0,1));
- t20 = _mm_movelh_ps(x1,z1);
- t21 = _mm_movehl_ps(z1,x1);
- t22 = _mm_movelh_ps(t13,t14);
- t14 = _mm_movehl_ps(t14,t13);
- t23 = _mm_movelh_ps(y2,x3);
- t24 = _mm_movehl_ps(x3,y2);
- t25 = _mm_movelh_ps(t15,t16);
- t16 = _mm_movehl_ps(t16,t15);
- t1 = _mm_loadu_ps(ptrA);
- t2 = _mm_loadu_ps(ptrA+4);
- t3 = _mm_load_ss(ptrA+8);
- t1 = _mm_add_ps(t1,t20);
- t2 = _mm_add_ps(t2,t23);
- t3 = _mm_add_ss(t3,z3);
- _mm_storeu_ps(ptrA,t1);
- _mm_storeu_ps(ptrA+4,t2);
- _mm_store_ss(ptrA+8,t3);
- t4 = _mm_loadu_ps(ptrB);
- t5 = _mm_loadu_ps(ptrB+4);
- t6 = _mm_load_ss(ptrB+8);
- t4 = _mm_add_ps(t4,t21);
- t5 = _mm_add_ps(t5,t24);
- t6 = _mm_add_ss(t6,t17);
- _mm_storeu_ps(ptrB,t4);
- _mm_storeu_ps(ptrB+4,t5);
- _mm_store_ss(ptrB+8,t6);
- t7 = _mm_loadu_ps(ptrC);
- t8 = _mm_loadu_ps(ptrC+4);
- t9 = _mm_load_ss(ptrC+8);
- t7 = _mm_add_ps(t7,t22);
- t8 = _mm_add_ps(t8,t25);
- t9 = _mm_add_ss(t9,t18);
- _mm_storeu_ps(ptrC,t7);
- _mm_storeu_ps(ptrC+4,t8);
- _mm_store_ss(ptrC+8,t9);
- t10 = _mm_loadu_ps(ptrD);
- t11 = _mm_loadu_ps(ptrD+4);
- t12 = _mm_load_ss(ptrD+8);
- t10 = _mm_add_ps(t10,t14);
- t11 = _mm_add_ps(t11,t16);
- t12 = _mm_add_ss(t12,t19);
- _mm_storeu_ps(ptrD,t10);
- _mm_storeu_ps(ptrD+4,t11);
- _mm_store_ss(ptrD+8,t12);
-}
-
-
-static void
-gmx_mm_increment_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
- __m128 x1, __m128 y1, __m128 z1,
- __m128 x2, __m128 y2, __m128 z2,
- __m128 x3, __m128 y3, __m128 z3,
- __m128 x4, __m128 y4, __m128 z4)
-{
- __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
- __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
- __m128 t23,t24;
- t13 = _mm_unpackhi_ps(x1,y1);
- x1 = _mm_unpacklo_ps(x1,y1);
- t14 = _mm_unpackhi_ps(z1,x2);
- z1 = _mm_unpacklo_ps(z1,x2);
- t15 = _mm_unpackhi_ps(y2,z2);
- y2 = _mm_unpacklo_ps(y2,z2);
- t16 = _mm_unpackhi_ps(x3,y3);
- x3 = _mm_unpacklo_ps(x3,y3);
- t17 = _mm_unpackhi_ps(z3,x4);
- z3 = _mm_unpacklo_ps(z3,x4);
- t18 = _mm_unpackhi_ps(y4,z4);
- y4 = _mm_unpacklo_ps(y4,z4);
- t19 = _mm_movelh_ps(x1,z1);
- z1 = _mm_movehl_ps(z1,x1);
- t20 = _mm_movelh_ps(t13,t14);
- t14 = _mm_movehl_ps(t14,t13);
- t21 = _mm_movelh_ps(y2,x3);
- x3 = _mm_movehl_ps(x3,y2);
- t22 = _mm_movelh_ps(t15,t16);
- t16 = _mm_movehl_ps(t16,t15);
- t23 = _mm_movelh_ps(z3,y4);
- y4 = _mm_movehl_ps(y4,z3);
- t24 = _mm_movelh_ps(t17,t18);
- t18 = _mm_movehl_ps(t18,t17);
- t1 = _mm_loadu_ps(ptrA);
- t2 = _mm_loadu_ps(ptrA+4);
- t3 = _mm_loadu_ps(ptrA+8);
- t1 = _mm_add_ps(t1,t19);
- t2 = _mm_add_ps(t2,t21);
- t3 = _mm_add_ps(t3,t23);
- _mm_storeu_ps(ptrA,t1);
- _mm_storeu_ps(ptrA+4,t2);
- _mm_storeu_ps(ptrA+8,t3);
- t4 = _mm_loadu_ps(ptrB);
- t5 = _mm_loadu_ps(ptrB+4);
- t6 = _mm_loadu_ps(ptrB+8);
- t4 = _mm_add_ps(t4,z1);
- t5 = _mm_add_ps(t5,x3);
- t6 = _mm_add_ps(t6,y4);
- _mm_storeu_ps(ptrB,t4);
- _mm_storeu_ps(ptrB+4,t5);
- _mm_storeu_ps(ptrB+8,t6);
- t7 = _mm_loadu_ps(ptrC);
- t8 = _mm_loadu_ps(ptrC+4);
- t9 = _mm_loadu_ps(ptrC+8);
- t7 = _mm_add_ps(t7,t20);
- t8 = _mm_add_ps(t8,t22);
- t9 = _mm_add_ps(t9,t24);
- _mm_storeu_ps(ptrC,t7);
- _mm_storeu_ps(ptrC+4,t8);
- _mm_storeu_ps(ptrC+8,t9);
- t10 = _mm_loadu_ps(ptrD);
- t11 = _mm_loadu_ps(ptrD+4);
- t12 = _mm_loadu_ps(ptrD+8);
- t10 = _mm_add_ps(t10,t14);
- t11 = _mm_add_ps(t11,t16);
- t12 = _mm_add_ps(t12,t18);
- _mm_storeu_ps(ptrD,t10);
- _mm_storeu_ps(ptrD+4,t11);
- _mm_storeu_ps(ptrD+8,t12);
-}
-
-
-/* Routines to decrement rvec in memory */
-static void
-gmx_mm_decrement_1rvec_1ptr_noswizzle_ps(float *ptrA, __m128 xyz)
-{
- __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
- __m128 t1;
-
- t1 = _mm_loadu_ps(ptrA);
- xyz = _mm_and_ps(mask,xyz);
- t1 = _mm_sub_ps(t1,xyz);
- _mm_storeu_ps(ptrA,t1);
-}
-
-
-static void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_ps(float *ptrA,
- __m128 xyz1, __m128 xyz2, __m128 xyz3)
-{
- __m128 t1,t2,t3,t4;
- __m128 tA,tB,tC;
-
- tA = _mm_loadu_ps(ptrA);
- tB = _mm_loadu_ps(ptrA+4);
- tC = _mm_load_ss(ptrA+8);
-
- t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
- t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
-
- t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3)); /* z1 z1 x2 x2 */
- t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
-
- t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
-
- tA = _mm_sub_ps(tA,t3);
- tB = _mm_sub_ps(tB,t4);
- tC = _mm_sub_ss(tC,t2);
-
- _mm_storeu_ps(ptrA,tA);
- _mm_storeu_ps(ptrA+4,tB);
- _mm_store_ss(ptrA+8,tC);
-}
-
-static void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_ps(float *ptrA,
- __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
-{
- __m128 t1,t2,t3,t4,t5;
- __m128 tA,tB,tC;
-
- tA = _mm_loadu_ps(ptrA);
- tB = _mm_loadu_ps(ptrA+4);
- tC = _mm_loadu_ps(ptrA+8);
-
- t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
- t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
-
- t3 = _mm_shuffle_ps(t1,xyz1,_MM_SHUFFLE(2,2,3,3)); /* z1 z1 x2 x2 */
- t3 = _mm_shuffle_ps(xyz1,t3,_MM_SHUFFLE(0,2,1,0)); /* x2 z1 y1 x1 */
-
- t4 = _mm_shuffle_ps(t1,t2,_MM_SHUFFLE(3,2,1,0)); /* y3 x3 z2 y2 */
-
- t5 = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 - */
- t2 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(1,1,0,0)); /* x4 x4 z3 z3 */
- t5 = _mm_shuffle_ps(t2,t5,_MM_SHUFFLE(3,2,2,0)); /* z4 y4 x4 z3 */
-
- tA = _mm_sub_ps(tA,t3);
- tB = _mm_sub_ps(tB,t4);
- tC = _mm_sub_ps(tC,t5);
-
- _mm_storeu_ps(ptrA,tA);
- _mm_storeu_ps(ptrA+4,tB);
- _mm_storeu_ps(ptrA+8,tC);
-
-}
-
-
static void
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
static void
-gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
__m128 x2, __m128 y2, __m128 z2,
__m128 x3, __m128 y3, __m128 z3)
static void
-gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float *ptrA, float *ptrB, float *ptrC, float *ptrD,
+gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
+ float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
__m128 x2, __m128 y2, __m128 z2,
__m128 x3, __m128 y3, __m128 z3,
static gmx_inline void
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
- float *fptr,
- float *fshiftptr)
+ float * gmx_restrict fptr,
+ float * gmx_restrict fshiftptr)
{
__m128 t1,t2,t3;
static gmx_inline void
gmx_mm_update_iforce_2atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
- float *fptr,
- float *fshiftptr)
+ float * gmx_restrict fptr,
+ float * gmx_restrict fshiftptr)
{
__m128 t1,t2,t4;
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
- float *fptr,
- float *fshiftptr)
+ float * gmx_restrict fptr,
+ float * gmx_restrict fshiftptr)
{
__m128 t1,t2,t3,t4;
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
__m128 fix4, __m128 fiy4, __m128 fiz4,
- float *fptr,
- float *fshiftptr)
+ float * gmx_restrict fptr,
+ float * gmx_restrict fshiftptr)
{
__m128 t1,t2,t3,t4,t5;
static void
-gmx_mm_update_1pot_ps(__m128 pot1, float *ptrA)
+gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
pot1 = _mm_add_ps(pot1,_mm_movehl_ps(_mm_setzero_ps(),pot1));
pot1 = _mm_add_ps(pot1,_mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)));
}
static void
-gmx_mm_update_2pot_ps(__m128 pot1, float *ptrA,
- __m128 pot2, float *ptrB)
+gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
+ __m128 pot2, float * gmx_restrict ptrB)
{
__m128 t1,t2;
t1 = _mm_movehl_ps(pot2,pot1);
static void
-gmx_mm_update_4pot_ps(__m128 pot1, float *ptrA,
- __m128 pot2, float *ptrB,
- __m128 pot3, float *ptrC,
- __m128 pot4, float *ptrD)
+gmx_mm_update_4pot_ps(__m128 pot1, float * gmx_restrict ptrA,
+ __m128 pot2, float * gmx_restrict ptrB,
+ __m128 pot3, float * gmx_restrict ptrC,
+ __m128 pot4, float * gmx_restrict ptrD)
{
_MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 73 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 74 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*74);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*74);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 61 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 62 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*62);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*62);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 159 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 162 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*162);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*162);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 139 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 142 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*142);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*142);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 417 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 426 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*426);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 373 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 382 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*382);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 185 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 189 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*189);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*189);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 165 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 169 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*169);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*169);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*456);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*456);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*412);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*412);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 56 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 57 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*57);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*57);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 47 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 48 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*48);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*48);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 142 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 145 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*145);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*145);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 125 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 128 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*128);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*128);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 400 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 409 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*409);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 359 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 368 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*368);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 161 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 164 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*164);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*164);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 144 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 147 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*147);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*147);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*431);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*431);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*390);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*390);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 43 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 44 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*44);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*44);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 39 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 40 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*40);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*40);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 129 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 132 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*132);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*132);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 117 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 120 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*120);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 387 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 396 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*396);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 351 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 360 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*360);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 129 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 132 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*132);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*132);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 117 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 120 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*120);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 387 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 396 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*396);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*396);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 351 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 360 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*360);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*360);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 63 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 64 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*64);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*64);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 54 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 55 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*55);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*55);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 119 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 120 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*120);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 109 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*109);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*109);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 287 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 288 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*288);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*288);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 270 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 271 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*271);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*271);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 140 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 141 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*141);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*141);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 129 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 130 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*130);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*130);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*312);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*312);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*295);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*295);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 40 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 40 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*40);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*40);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 34 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 34 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*34);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*34);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 96 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 96 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*96);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*96);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 88 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 88 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*88);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*88);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 264 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 264 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*264);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*264);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 250 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 250 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*250);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*250);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 116 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 116 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*116);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*116);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 108 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*108);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*108);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*287);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*287);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*273);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*273);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 28 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 28 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*28);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*28);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 27 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 27 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*27);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*27);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 84 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 84 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*84);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*84);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 81 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 81 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*81);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*81);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 252 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 252 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*252);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*252);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 243 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 243 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*243);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*243);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 84 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 84 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*84);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*84);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 81 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 81 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*81);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*81);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 252 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 252 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*252);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*252);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 243 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 243 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*243);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*243);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 64 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 65 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*65);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*65);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 46 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 47 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*47);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*47);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 156 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 159 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*159);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*159);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 124 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 127 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*127);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*127);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 432 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 441 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*441);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 358 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 367 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*367);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 179 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 182 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*182);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*182);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 147 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 150 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*150);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*150);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*467);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*467);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*393);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*393);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 46 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 47 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*47);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*47);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 39 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 40 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*40);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*40);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 138 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 141 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*141);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*141);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 117 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 120 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*120);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 414 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 423 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*423);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*423);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 351 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 360 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*360);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 138 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 141 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*141);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*141);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 117 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 120 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*120);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 414 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 423 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*423);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*423);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 351 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 360 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*360);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*360);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 83 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 84 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*84);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*84);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 77 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 78 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*78);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*78);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 213 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 216 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*216);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*216);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 201 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 204 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*204);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*204);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 603 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 612 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*612);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*612);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 573 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 582 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*582);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*582);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 254 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 258 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*258);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*258);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 242 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 246 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*246);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*246);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*657);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*627);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 65 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 66 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*66);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*66);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 62 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 63 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*63);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*63);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 195 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 198 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*198);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*198);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 186 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 189 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*189);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*189);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 585 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 594 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*594);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*594);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 558 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 567 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*567);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*567);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 195 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 198 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*198);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*198);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 186 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 189 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*189);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*189);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 585 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 594 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*594);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*594);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 558 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 567 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*567);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*567);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 75 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 76 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*76);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*76);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 62 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 63 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*63);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*63);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 157 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 160 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*160);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*160);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 134 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 137 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*137);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*137);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 403 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 412 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*412);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*412);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 350 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 359 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*359);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 179 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 183 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*183);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*183);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 156 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 160 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*160);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*160);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*438);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*438);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*385);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*385);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 53 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 54 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*54);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*54);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 43 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 44 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*44);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*44);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 135 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 138 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*138);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*138);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 115 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 118 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*118);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*118);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 381 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 390 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*390);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*390);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 331 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 340 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*340);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*340);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 155 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 158 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*158);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*158);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 135 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 138 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*138);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*138);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*413);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*413);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*363);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*363);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 41 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 42 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*42);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*42);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 36 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 37 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*37);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*37);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 123 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 126 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*126);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*126);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 111 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*111);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*111);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 369 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 378 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*378);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*378);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 324 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 333 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*333);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*333);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 123 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 126 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*126);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*126);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 111 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*111);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*111);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 369 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 378 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*378);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*378);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 324 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 333 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*333);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*333);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
int nvdwtype;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 92 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* The pointers to scratch make sure that compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up in this code. */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 93 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 13 flops */
+ /* Outer loop uses 10 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*13 + inneriter*93);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*93);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
int nvdwtype;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 82 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* The pointers to scratch make sure that compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up in this code. */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 83 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*83);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*83);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
int nvdwtype;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 71 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* The pointers to scratch make sure that compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up in this code. */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 72 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 13 flops */
+ /* Outer loop uses 10 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*13 + inneriter*72);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*72);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
int nvdwtype;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 64 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* Dummy (negative) j-list entries are redirected to scratch, so the update below stays safe even with compilers that take gmx_restrict seriously (e.g. icc 13). */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 65 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*65);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*65);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
__m128i vfitab;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 58 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* Dummy (negative) j-list entries are redirected to scratch, so the update below stays safe even with compilers that take gmx_restrict seriously (e.g. icc 13). */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 59 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*12 + inneriter*59);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*59);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
__m128i vfitab;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 56 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* Dummy (negative) j-list entries are redirected to scratch, so the update below stays safe even with compilers that take gmx_restrict seriously (e.g. icc 13). */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 57 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*57);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*57);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 56 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 57 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*57);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*57);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 48 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 49 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 9 flops */
+ /* Outer loop uses 6 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*49);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*49);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 41 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 41 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*41);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*41);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 30 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 30 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 9 flops */
+ /* Outer loop uses 6 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*30);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*30);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 59 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 60 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*60);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*60);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 56 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 57 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 9 flops */
+ /* Outer loop uses 6 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*57);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*57);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 32 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 32 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*32);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*32);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 27 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 27 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 9 flops */
+ /* Outer loop uses 6 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*27);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*27);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 72 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 73 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*73);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*73);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 57 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 58 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*58);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*58);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 144 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 145 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*145);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*145);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 117 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 118 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*118);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*118);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 360 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 361 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*361);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*361);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 297 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 298 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*298);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*298);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 164 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 165 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*165);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*165);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 138 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 139 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*139);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*139);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
* CALCULATE INTERACTIONS *
**************************/
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
r00 = _mm_mul_ps(rsq00,rinv00);
/* Calculate table index by multiplying r with table scale and truncate to integer */
vvdw = _mm_add_ps(vvdw12,vvdw6);
fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
/* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
vvdwsum = _mm_add_ps(vvdwsum,vvdw);
fscal = fvdw;
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
/* Calculate temporary vectorial force */
tx = _mm_mul_ps(fscal,dx00);
ty = _mm_mul_ps(fscal,dy00);
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
/**************************
* CALCULATE INTERACTIONS *
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
- /* Inner loop uses 383 flops */
+ /* Inner loop uses 387 flops */
}
if(jidx<j_index_end)
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
* CALCULATE INTERACTIONS *
**************************/
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
r00 = _mm_mul_ps(rsq00,rinv00);
r00 = _mm_andnot_ps(dummy_mask,r00);
vvdw = _mm_add_ps(vvdw12,vvdw6);
fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
/* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
vvdw = _mm_andnot_ps(dummy_mask,vvdw);
vvdwsum = _mm_add_ps(vvdwsum,vvdw);
fscal = fvdw;
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
fscal = _mm_andnot_ps(dummy_mask,fscal);
/* Calculate temporary vectorial force */
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
/**************************
* CALCULATE INTERACTIONS *
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
- /* Inner loop uses 384 flops */
+ /* Inner loop uses 388 flops */
}
/* End of innermost loop */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*384);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*388);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
* CALCULATE INTERACTIONS *
**************************/
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
r00 = _mm_mul_ps(rsq00,rinv00);
/* Calculate table index by multiplying r with table scale and truncate to integer */
fvdw12 = _mm_mul_ps(c12_00,FF);
fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
fscal = fvdw;
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
/* Calculate temporary vectorial force */
tx = _mm_mul_ps(fscal,dx00);
ty = _mm_mul_ps(fscal,dy00);
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
/**************************
* CALCULATE INTERACTIONS *
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
- /* Inner loop uses 321 flops */
+ /* Inner loop uses 324 flops */
}
if(jidx<j_index_end)
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
* CALCULATE INTERACTIONS *
**************************/
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
r00 = _mm_mul_ps(rsq00,rinv00);
r00 = _mm_andnot_ps(dummy_mask,r00);
fvdw12 = _mm_mul_ps(c12_00,FF);
fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
fscal = fvdw;
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
fscal = _mm_andnot_ps(dummy_mask,fscal);
/* Calculate temporary vectorial force */
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
+
+ }
/**************************
* CALCULATE INTERACTIONS *
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
- /* Inner loop uses 322 flops */
+ /* Inner loop uses 325 flops */
}
/* End of innermost loop */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*322);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*325);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 54 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 54 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*54);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*54);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 37 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 37 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*37);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*37);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 126 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 126 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*126);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*126);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 97 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 97 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*97);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*97);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 342 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 342 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*342);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*342);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 277 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 277 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*277);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 149 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 149 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*149);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*149);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 120 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 120 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*120);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*368);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*368);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*303);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 70 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 71 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*71);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*71);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 61 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 62 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*62);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*62);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 142 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 143 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*143);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*143);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 121 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 122 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*122);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*122);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 358 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 359 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*359);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*359);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 301 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 302 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*302);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*302);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 167 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 168 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*168);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*168);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 146 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 147 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*147);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*147);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*387);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*387);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*330);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*330);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 36 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 36 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*36);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*36);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 30 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 30 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*30);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*30);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 108 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 108 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*108);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*108);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 90 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 90 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*90);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*90);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 324 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 324 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*324);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*324);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 270 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 270 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*270);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*270);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 108 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 108 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*108);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*108);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 90 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/**************************
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
}
/* Inner loop uses 90 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*90);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*90);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 324 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 324 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*324);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*324);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 270 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 270 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*270);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 67 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 68 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*68);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*68);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 54 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 55 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*55);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*55);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 131 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 132 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*132);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*132);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 109 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*109);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*109);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 323 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 324 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*324);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*324);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 270 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 271 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*271);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*271);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 152 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 153 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*153);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*153);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 129 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 130 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*130);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*130);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*348);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*348);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*295);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*295);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 44 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 44 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*44);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*44);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 34 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 34 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*34);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*34);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 108 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*108);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*108);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 88 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 88 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*88);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*88);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 300 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 300 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*300);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*300);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 250 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 250 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*250);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*250);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 128 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 128 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*128);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*128);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 108 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*108);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*108);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*323);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*323);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*273);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*273);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 32 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 32 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*32);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*32);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 27 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 27 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*27);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*27);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 96 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 96 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*96);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*96);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 81 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 81 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*81);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*81);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 288 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 288 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*288);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*288);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 243 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 243 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*243);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*243);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 96 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 96 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*96);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*96);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 81 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
-
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
/* Inner loop uses 81 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*81);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*81);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 288 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 288 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*288);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*288);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
fiz1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 243 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
+
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 243 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*243);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*243);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
/* #for I in PARTICLES_I */
int vdwioffset{I};
/* #endif */
/* #if 'GeneralizedBorn' in KERNEL_ELEC */
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
/* #endif */
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- /* ## Loop over i particles, but only include ones that we use - skip e.g. vdw-only sites for elec-only kernel */
- /* #for I in PARTICLES_I */
- ix{I} = _mm_set1_ps(shX + x[i_coord_offset+DIM*{I}+XX]);
- iy{I} = _mm_set1_ps(shY + x[i_coord_offset+DIM*{I}+YY]);
- iz{I} = _mm_set1_ps(shZ + x[i_coord_offset+DIM*{I}+ZZ]);
- /* #define OUTERFLOPS OUTERFLOPS+3 */
- /* #endfor */
-
+ /* #if GEOMETRY_I == 'Particle' */
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+ /* #elif GEOMETRY_I == 'Water3' */
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+ /* #elif GEOMETRY_I == 'Water4' */
+ /* #if 0 in PARTICLES_I */
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+ /* #else */
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+ /* #endif */
+ /* #endif */
+
/* #if 'Force' in KERNEL_VF */
/* #for I in PARTICLES_I */
fix{I} = _mm_setzero_ps();
/* #define INNERFLOPS 0 */
/* Get j neighbor index, and coordinate index */
+ /* #if ROUND =='Loop' */
jnrA = jjnr[jidx];
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
- /* #if ROUND =='Epilogue' */
+ /* #else */
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
/* #endif */
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
* CALCULATE INTERACTIONS *
**************************/
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
/* ## We always calculate rinv/rinvsq above to enable pipelineing in compilers (performance tested on x86) */
if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
{
isaprod = _mm_mul_ps(isai{I},isaj{J});
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq{I}{J},_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+{J},dvda+jnrB+{J},dvda+jnrC+{J},dvda+jnrD+{J});
/* #define INNERFLOPS INNERFLOPS+5 */
/* Calculate generalized born table index - this is a separate table from the normal one,
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J})));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
+ /* #if ROUND == 'Loop' */
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ /* #else */
+ /* The pointers to scratch make sure that this code really can't get screwed up by compilers that take gmx_restrict seriously (e.g. icc 13). */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ /* #endif */
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
/* #define INNERFLOPS INNERFLOPS+13 */
/* #endif */
velec = _mm_mul_ps(qq{I}{J},rinv{I}{J});
/* #endif */
/* #endif */
/* #endif */
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
cutoff_mask = _mm_cmplt_ps(rsq{I}{J},rcutoff2);
/* #define INNERFLOPS INNERFLOPS+1 */
/* #endif */
/* #endif */
/* #endif */
/* #if 'vdw' in INTERACTION_FLAGS[I][J] */
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
vvdw = _mm_and_ps(vvdw,cutoff_mask);
/* #define INNERFLOPS INNERFLOPS+1 */
/* #endif */
fscal = fvdw;
/* #endif */
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
fscal = _mm_and_ps(fscal,cutoff_mask);
- /* #define INNERFLOPS INNERFLOPS+1 */
- /* #endif */
+ /* #define INNERFLOPS INNERFLOPS+1 */
+ /* #endif */
/* #if ROUND == 'Epilogue' */
fscal = _mm_andnot_ps(dummy_mask,fscal);
/* #define INNERFLOPS INNERFLOPS+6 */
/* #if GEOMETRY_J == 'Particle' */
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ /* #if ROUND == 'Loop' */
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ /* #else */
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ /* #endif */
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* #define INNERFLOPS INNERFLOPS+3 */
/* #else */
fjx{J} = _mm_add_ps(fjx{J},tx);
fjz{J} = _mm_add_ps(fjz{J},tz);
/* #define INNERFLOPS INNERFLOPS+3 */
/* #endif */
-
+
/* #endif */
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
/* #if 0 ## This and next two lines is a hack to maintain indentation in template file */
{
/* #endif */
/* #endfor */
/* ## End of loop over i-j interaction pairs */
+ /* #if GEOMETRY_J != 'Particle' */
+ /* #if ROUND == 'Loop' */
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ /* #else */
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ /* #endif */
+ /* #endif */
+
/* #if GEOMETRY_J == 'Water3' */
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* #define INNERFLOPS INNERFLOPS+9 */
/* #elif GEOMETRY_J == 'Water4' */
/* #if 0 in PARTICLES_J */
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* #define INNERFLOPS INNERFLOPS+12 */
/* #else */
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* #define INNERFLOPS INNERFLOPS+9 */
/* #endif */
#include "gmx_x86_sse4_1.h"
+#undef gmx_restrict
+#define gmx_restrict
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0,t1,t2,t3) _mm_add_ps(_mm_add_ps(t0,t1),_mm_add_ps(t2,t3))
*c12 = _mm_movehl_ps(t2,t1);
}
-/* Routines to load 1-4 rvec from 4 places.
- * We mainly use these to load coordinates. The extra routines
- * are very efficient for the water-water loops, since we e.g.
- * know that a TIP4p water has 4 atoms, so we should load 12 floats+shuffle.
- */
-
static gmx_inline void
-gmx_mm_load_1rvec_broadcast_ps(const float * gmx_restrict ptrA,
- __m128 * gmx_restrict x,
- __m128 * gmx_restrict y,
- __m128 * gmx_restrict z)
+gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1,
+ __m128 * gmx_restrict y1,
+ __m128 * gmx_restrict z1)
{
- __m128 t1;
+ __m128 t1,t2,t3,t4;
- t1 = _mm_loadu_ps(ptrA);
+ t1 = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
+ t2 = _mm_castpd_ps(_mm_load_sd((const double *)xyz));
+ t3 = _mm_load_ss(xyz_shift+2);
+ t4 = _mm_load_ss(xyz+2);
+ t1 = _mm_add_ps(t1,t2);
+ t3 = _mm_add_ss(t3,t4);
- *x = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
- *y = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
- *z = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+ *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+ *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
+ *z1 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
}
+
static gmx_inline void
-gmx_mm_load_3rvec_broadcast_ps(const float * gmx_restrict ptrA,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
+gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
- __m128 t1,t2,t3;
-
- t1 = _mm_loadu_ps(ptrA);
- t2 = _mm_loadu_ps(ptrA+4);
-
- *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
- *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
- *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
- *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
- *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
- *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
- *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
- *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
-
- t3 = _mm_load_ss(ptrA+8);
- *z3 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+ __m128 tA,tB;
+ __m128 t1,t2,t3,t4,t5,t6;
+
+ tA = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
+ tB = _mm_load_ss(xyz_shift+2);
+
+ t1 = _mm_loadu_ps(xyz);
+ t2 = _mm_loadu_ps(xyz+4);
+ t3 = _mm_load_ss(xyz+8);
+
+ tA = _mm_movelh_ps(tA,tB);
+ t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
+ t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
+ t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
+
+ t1 = _mm_add_ps(t1,t4);
+ t2 = _mm_add_ps(t2,t5);
+ t3 = _mm_add_ss(t3,t6);
+
+ *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+ *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
+ *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+ *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
+ *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
+ *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
+ *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
+ *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
+ *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
}
+
static gmx_inline void
-gmx_mm_load_4rvec_broadcast_ps(const float * gmx_restrict ptrA,
- __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
- __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
- __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
- __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
+gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
+ const float * gmx_restrict xyz,
+ __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
+ __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
+ __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
+ __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
- __m128 t1,t2,t3;
- __m128 tA;
-
- t1 = _mm_loadu_ps(ptrA);
- t2 = _mm_loadu_ps(ptrA+4);
- t3 = _mm_loadu_ps(ptrA+8);
-
- *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
- *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
- *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
- *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
- *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
- *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
- *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
- *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
- *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
- *x4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
- *y4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(2,2,2,2));
- *z4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(3,3,3,3));
+ __m128 tA,tB;
+ __m128 t1,t2,t3,t4,t5,t6;
+
+ tA = _mm_castpd_ps(_mm_load_sd((const double *)xyz_shift));
+ tB = _mm_load_ss(xyz_shift+2);
+
+ t1 = _mm_loadu_ps(xyz);
+ t2 = _mm_loadu_ps(xyz+4);
+ t3 = _mm_loadu_ps(xyz+8);
+
+ tA = _mm_movelh_ps(tA,tB);
+ t4 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(0,2,1,0));
+ t5 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(1,0,2,1));
+ t6 = _mm_shuffle_ps(tA,tA,_MM_SHUFFLE(2,1,0,2));
+
+ t1 = _mm_add_ps(t1,t4);
+ t2 = _mm_add_ps(t2,t5);
+ t3 = _mm_add_ps(t3,t6);
+
+ *x1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(0,0,0,0));
+ *y1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(1,1,1,1));
+ *z1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(2,2,2,2));
+ *x2 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,3,3));
+ *y2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,0));
+ *z2 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(1,1,1,1));
+ *x3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(2,2,2,2));
+ *y3 = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(3,3,3,3));
+ *z3 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,0));
+ *x4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(1,1,1,1));
+ *y4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(2,2,2,2));
+ *z4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(3,3,3,3));
}
}
-/* Routines to increment rvec in memory, typically use for j particle force updates */
-static gmx_inline void
-gmx_mm_increment_1rvec_1ptr_noswizzle_ps(float * gmx_restrict ptrA, __m128 xyz)
-{
- __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
- __m128 t1;
-
- t1 = _mm_loadu_ps(ptrA);
- xyz = _mm_and_ps(mask,xyz);
- t1 = _mm_add_ps(t1,xyz);
- _mm_storeu_ps(ptrA,t1);
-}
-
-
-static gmx_inline void
-gmx_mm_increment_3rvec_1ptr_noswizzle_ps(float * gmx_restrict ptrA,
- __m128 xyz1, __m128 xyz2, __m128 xyz3)
-{
- __m128 t1,t2,t3,t4;
- __m128 tA,tB,tC;
-
- tA = _mm_loadu_ps(ptrA);
- tB = _mm_loadu_ps(ptrA+4);
- tC = _mm_load_ss(ptrA+8);
-
- t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
- t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
-
- t3 = _mm_blend_ps(t1,xyz1,_GMX_MM_BLEND(0,1,1,1)); /* x2 z1 y1 x1 */
- t4 = _mm_blend_ps(t1,t2,_GMX_MM_BLEND(1,1,0,0)); /* y3 x3 z2 y2 */
-
- tA = _mm_add_ps(tA,t3);
- tB = _mm_add_ps(tB,t4);
- tC = _mm_add_ss(tC,t2);
-
- _mm_storeu_ps(ptrA,tA);
- _mm_storeu_ps(ptrA+4,tB);
- _mm_store_ss(ptrA+8,tC);
-}
-
-static gmx_inline void
-gmx_mm_increment_4rvec_1ptr_noswizzle_ps(float * gmx_restrict ptrA,
- __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
-{
- __m128 t1,t2,t3,t4,t5;
- __m128 tA,tB,tC;
-
- tA = _mm_loadu_ps(ptrA);
- tB = _mm_loadu_ps(ptrA+4);
- tC = _mm_loadu_ps(ptrA+8);
-
- t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
- t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
-
- t3 = _mm_blend_ps(t1,xyz1,_GMX_MM_BLEND(0,1,1,1)); /* x2 z1 y1 x1 */
- t4 = _mm_blend_ps(t1,t2,_GMX_MM_BLEND(1,1,0,0)); /* y3 x3 z2 y2 */
- t5 = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 - */
- t5 = _mm_blend_ps(t5,t2,_GMX_MM_BLEND(0,0,0,1)); /* z4 y4 x4 z3 */
-
- tA = _mm_add_ps(tA,t3);
- tB = _mm_add_ps(tB,t4);
- tC = _mm_add_ps(tC,t5);
-
- _mm_storeu_ps(ptrA,tA);
- _mm_storeu_ps(ptrA+4,tB);
- _mm_storeu_ps(ptrA+8,tC);
-}
-
-
static gmx_inline void
-gmx_mm_increment_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC,float * gmx_restrict ptrD,
- __m128 x1, __m128 y1, __m128 z1)
+gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * ptrA,
+ float * ptrB,
+ float * ptrC,
+ float * ptrD,
+ __m128 x1, __m128 y1, __m128 z1)
{
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
t5 = _mm_unpacklo_ps(y1,z1);
t10 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
t1 = _mm_load_ss(ptrA);
t1 = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
- t1 = _mm_add_ps(t1,t7);
+ t1 = _mm_sub_ps(t1,t7);
_mm_store_ss(ptrA,t1);
_mm_storeh_pi((__m64 *)(ptrA+1),t1);
t2 = _mm_load_ss(ptrB);
t2 = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
- t2 = _mm_add_ps(t2,t8);
+ t2 = _mm_sub_ps(t2,t8);
_mm_store_ss(ptrB,t2);
_mm_storeh_pi((__m64 *)(ptrB+1),t2);
t3 = _mm_load_ss(ptrC);
t3 = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
- t3 = _mm_add_ps(t3,t9);
+ t3 = _mm_sub_ps(t3,t9);
_mm_store_ss(ptrC,t3);
_mm_storeh_pi((__m64 *)(ptrC+1),t3);
t4 = _mm_load_ss(ptrD);
t4 = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
- t4 = _mm_add_ps(t4,t10);
+ t4 = _mm_sub_ps(t4,t10);
_mm_store_ss(ptrD,t4);
_mm_storeh_pi((__m64 *)(ptrD+1),t4);
}
-
static gmx_inline void
-gmx_mm_increment_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
+gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
__m128 x2, __m128 y2, __m128 z2,
__m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
__m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
__m128 t20,t21,t22,t23,t24,t25;
-
+
t13 = _mm_unpackhi_ps(x1,y1);
x1 = _mm_unpacklo_ps(x1,y1);
t14 = _mm_unpackhi_ps(z1,x2);
t1 = _mm_loadu_ps(ptrA);
t2 = _mm_loadu_ps(ptrA+4);
t3 = _mm_load_ss(ptrA+8);
- t1 = _mm_add_ps(t1,t20);
- t2 = _mm_add_ps(t2,t23);
- t3 = _mm_add_ss(t3,z3);
- _mm_storeu_ps(ptrA,t1);
- _mm_storeu_ps(ptrA+4,t2);
- _mm_store_ss(ptrA+8,t3);
t4 = _mm_loadu_ps(ptrB);
t5 = _mm_loadu_ps(ptrB+4);
t6 = _mm_load_ss(ptrB+8);
- t4 = _mm_add_ps(t4,t21);
- t5 = _mm_add_ps(t5,t24);
- t6 = _mm_add_ss(t6,t17);
- _mm_storeu_ps(ptrB,t4);
- _mm_storeu_ps(ptrB+4,t5);
- _mm_store_ss(ptrB+8,t6);
t7 = _mm_loadu_ps(ptrC);
t8 = _mm_loadu_ps(ptrC+4);
t9 = _mm_load_ss(ptrC+8);
- t7 = _mm_add_ps(t7,t22);
- t8 = _mm_add_ps(t8,t25);
- t9 = _mm_add_ss(t9,t18);
- _mm_storeu_ps(ptrC,t7);
- _mm_storeu_ps(ptrC+4,t8);
- _mm_store_ss(ptrC+8,t9);
t10 = _mm_loadu_ps(ptrD);
t11 = _mm_loadu_ps(ptrD+4);
t12 = _mm_load_ss(ptrD+8);
- t10 = _mm_add_ps(t10,t14);
- t11 = _mm_add_ps(t11,t16);
- t12 = _mm_add_ss(t12,t19);
- _mm_storeu_ps(ptrD,t10);
- _mm_storeu_ps(ptrD+4,t11);
- _mm_store_ss(ptrD+8,t12);
-}
-
-
-static gmx_inline void
-gmx_mm_increment_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- __m128 x1, __m128 y1, __m128 z1,
- __m128 x2, __m128 y2, __m128 z2,
- __m128 x3, __m128 y3, __m128 z3,
- __m128 x4, __m128 y4, __m128 z4)
-{
- __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11;
- __m128 t12,t13,t14,t15,t16,t17,t18,t19,t20,t21,t22;
- __m128 t23,t24;
- t13 = _mm_unpackhi_ps(x1,y1);
- x1 = _mm_unpacklo_ps(x1,y1);
- t14 = _mm_unpackhi_ps(z1,x2);
- z1 = _mm_unpacklo_ps(z1,x2);
- t15 = _mm_unpackhi_ps(y2,z2);
- y2 = _mm_unpacklo_ps(y2,z2);
- t16 = _mm_unpackhi_ps(x3,y3);
- x3 = _mm_unpacklo_ps(x3,y3);
- t17 = _mm_unpackhi_ps(z3,x4);
- z3 = _mm_unpacklo_ps(z3,x4);
- t18 = _mm_unpackhi_ps(y4,z4);
- y4 = _mm_unpacklo_ps(y4,z4);
- t19 = _mm_movelh_ps(x1,z1);
- z1 = _mm_movehl_ps(z1,x1);
- t20 = _mm_movelh_ps(t13,t14);
- t14 = _mm_movehl_ps(t14,t13);
- t21 = _mm_movelh_ps(y2,x3);
- x3 = _mm_movehl_ps(x3,y2);
- t22 = _mm_movelh_ps(t15,t16);
- t16 = _mm_movehl_ps(t16,t15);
- t23 = _mm_movelh_ps(z3,y4);
- y4 = _mm_movehl_ps(y4,z3);
- t24 = _mm_movelh_ps(t17,t18);
- t18 = _mm_movehl_ps(t18,t17);
- t1 = _mm_loadu_ps(ptrA);
- t2 = _mm_loadu_ps(ptrA+4);
- t3 = _mm_loadu_ps(ptrA+8);
- t1 = _mm_add_ps(t1,t19);
- t2 = _mm_add_ps(t2,t21);
- t3 = _mm_add_ps(t3,t23);
- _mm_storeu_ps(ptrA,t1);
- _mm_storeu_ps(ptrA+4,t2);
- _mm_storeu_ps(ptrA+8,t3);
- t4 = _mm_loadu_ps(ptrB);
- t5 = _mm_loadu_ps(ptrB+4);
- t6 = _mm_loadu_ps(ptrB+8);
- t4 = _mm_add_ps(t4,z1);
- t5 = _mm_add_ps(t5,x3);
- t6 = _mm_add_ps(t6,y4);
- _mm_storeu_ps(ptrB,t4);
- _mm_storeu_ps(ptrB+4,t5);
- _mm_storeu_ps(ptrB+8,t6);
- t7 = _mm_loadu_ps(ptrC);
- t8 = _mm_loadu_ps(ptrC+4);
- t9 = _mm_loadu_ps(ptrC+8);
- t7 = _mm_add_ps(t7,t20);
- t8 = _mm_add_ps(t8,t22);
- t9 = _mm_add_ps(t9,t24);
- _mm_storeu_ps(ptrC,t7);
- _mm_storeu_ps(ptrC+4,t8);
- _mm_storeu_ps(ptrC+8,t9);
- t10 = _mm_loadu_ps(ptrD);
- t11 = _mm_loadu_ps(ptrD+4);
- t12 = _mm_loadu_ps(ptrD+8);
- t10 = _mm_add_ps(t10,t14);
- t11 = _mm_add_ps(t11,t16);
- t12 = _mm_add_ps(t12,t18);
- _mm_storeu_ps(ptrD,t10);
- _mm_storeu_ps(ptrD+4,t11);
- _mm_storeu_ps(ptrD+8,t12);
-}
-
-
-/* Routines to decrement rvec in memory */
-static gmx_inline void
-gmx_mm_decrement_1rvec_1ptr_noswizzle_ps(float * gmx_restrict ptrA, __m128 xyz)
-{
- __m128 mask = gmx_mm_castsi128_ps( _mm_set_epi32(0,-1,-1,-1) );
- __m128 t1;
-
- t1 = _mm_loadu_ps(ptrA);
- xyz = _mm_and_ps(mask,xyz);
- t1 = _mm_sub_ps(t1,xyz);
- _mm_storeu_ps(ptrA,t1);
-}
-
-
-static gmx_inline void
-gmx_mm_decrement_3rvec_1ptr_noswizzle_ps(float * gmx_restrict ptrA,
- __m128 xyz1, __m128 xyz2, __m128 xyz3)
-{
- __m128 t1,t2,t3,t4;
- __m128 tA,tB,tC;
-
- tA = _mm_loadu_ps(ptrA);
- tB = _mm_loadu_ps(ptrA+4);
- tC = _mm_load_ss(ptrA+8);
-
- t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
- t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
-
- t3 = _mm_blend_ps(t1,xyz1,_GMX_MM_BLEND(0,1,1,1)); /* x2 z1 y1 x1 */
- t4 = _mm_blend_ps(t1,t2,_GMX_MM_BLEND(1,1,0,0)); /* y3 x3 z2 y2 */
-
- tA = _mm_sub_ps(tA,t3);
- tB = _mm_sub_ps(tB,t4);
- tC = _mm_sub_ss(tC,t2);
-
- _mm_storeu_ps(ptrA,tA);
- _mm_storeu_ps(ptrA+4,tB);
- _mm_store_ss(ptrA+8,tC);
-}
-
-static gmx_inline void
-gmx_mm_decrement_4rvec_1ptr_noswizzle_ps(float * gmx_restrict ptrA,
- __m128 xyz1, __m128 xyz2, __m128 xyz3, __m128 xyz4)
-{
- __m128 t1,t2,t3,t4,t5;
- __m128 tA,tB,tC;
-
- tA = _mm_loadu_ps(ptrA);
- tB = _mm_loadu_ps(ptrA+4);
- tC = _mm_loadu_ps(ptrA+8);
-
- t1 = _mm_shuffle_ps(xyz2,xyz2,_MM_SHUFFLE(0,0,2,1)); /* x2 - z2 y2 */
- t2 = _mm_shuffle_ps(xyz3,xyz3,_MM_SHUFFLE(1,0,0,2)); /* y3 x3 - z3 */
-
- t3 = _mm_blend_ps(t1,xyz1,_GMX_MM_BLEND(0,1,1,1)); /* x2 z1 y1 x1 */
- t4 = _mm_blend_ps(t1,t2,_GMX_MM_BLEND(1,1,0,0)); /* y3 x3 z2 y2 */
- t5 = _mm_shuffle_ps(xyz4,xyz4,_MM_SHUFFLE(2,1,0,0)); /* z4 y4 x4 - */
- t5 = _mm_blend_ps(t5,t2,_GMX_MM_BLEND(0,0,0,1)); /* z4 y4 x4 z3 */
- tA = _mm_sub_ps(tA,t3);
- tB = _mm_sub_ps(tB,t4);
- tC = _mm_sub_ps(tC,t5);
-
- _mm_storeu_ps(ptrA,tA);
- _mm_storeu_ps(ptrA+4,tB);
- _mm_storeu_ps(ptrA+8,tC);
-}
-
-
-static gmx_inline void
-gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
- float * gmx_restrict ptrB,
- float * gmx_restrict ptrC,
- float * gmx_restrict ptrD,
- __m128 x1, __m128 y1, __m128 z1)
-{
- __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
- t5 = _mm_unpacklo_ps(y1,z1);
- t6 = _mm_unpackhi_ps(y1,z1);
- t7 = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(1,0,0,0));
- t8 = _mm_shuffle_ps(x1,t5,_MM_SHUFFLE(3,2,0,1));
- t9 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(1,0,0,2));
- t10 = _mm_shuffle_ps(x1,t6,_MM_SHUFFLE(3,2,0,3));
- t1 = _mm_load_ss(ptrA);
- t1 = _mm_loadh_pi(t1,(__m64 *)(ptrA+1));
- t1 = _mm_sub_ps(t1,t7);
- _mm_store_ss(ptrA,t1);
- _mm_storeh_pi((__m64 *)(ptrA+1),t1);
- t2 = _mm_load_ss(ptrB);
- t2 = _mm_loadh_pi(t2,(__m64 *)(ptrB+1));
- t2 = _mm_sub_ps(t2,t8);
- _mm_store_ss(ptrB,t2);
- _mm_storeh_pi((__m64 *)(ptrB+1),t2);
- t3 = _mm_load_ss(ptrC);
- t3 = _mm_loadh_pi(t3,(__m64 *)(ptrC+1));
- t3 = _mm_sub_ps(t3,t9);
- _mm_store_ss(ptrC,t3);
- _mm_storeh_pi((__m64 *)(ptrC+1),t3);
- t4 = _mm_load_ss(ptrD);
- t4 = _mm_loadh_pi(t4,(__m64 *)(ptrD+1));
- t4 = _mm_sub_ps(t4,t10);
- _mm_store_ss(ptrD,t4);
- _mm_storeh_pi((__m64 *)(ptrD+1),t4);
-}
-
-
-
-static gmx_inline void
-gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
- float * gmx_restrict ptrC, float * gmx_restrict ptrD,
- __m128 x1, __m128 y1, __m128 z1,
- __m128 x2, __m128 y2, __m128 z2,
- __m128 x3, __m128 y3, __m128 z3)
-{
- __m128 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
- __m128 t11,t12,t13,t14,t15,t16,t17,t18,t19;
- __m128 t20,t21,t22,t23,t24,t25;
-
- t13 = _mm_unpackhi_ps(x1,y1);
- x1 = _mm_unpacklo_ps(x1,y1);
- t14 = _mm_unpackhi_ps(z1,x2);
- z1 = _mm_unpacklo_ps(z1,x2);
- t15 = _mm_unpackhi_ps(y2,z2);
- y2 = _mm_unpacklo_ps(y2,z2);
- t16 = _mm_unpackhi_ps(x3,y3);
- x3 = _mm_unpacklo_ps(x3,y3);
- t17 = _mm_shuffle_ps(z3,z3,_MM_SHUFFLE(0,0,0,1));
- t18 = _mm_movehl_ps(z3,z3);
- t19 = _mm_shuffle_ps(t18,t18,_MM_SHUFFLE(0,0,0,1));
- t20 = _mm_movelh_ps(x1,z1);
- t21 = _mm_movehl_ps(z1,x1);
- t22 = _mm_movelh_ps(t13,t14);
- t14 = _mm_movehl_ps(t14,t13);
- t23 = _mm_movelh_ps(y2,x3);
- t24 = _mm_movehl_ps(x3,y2);
- t25 = _mm_movelh_ps(t15,t16);
- t16 = _mm_movehl_ps(t16,t15);
- t1 = _mm_loadu_ps(ptrA);
- t2 = _mm_loadu_ps(ptrA+4);
- t3 = _mm_load_ss(ptrA+8);
t1 = _mm_sub_ps(t1,t20);
t2 = _mm_sub_ps(t2,t23);
t3 = _mm_sub_ss(t3,z3);
_mm_storeu_ps(ptrA,t1);
_mm_storeu_ps(ptrA+4,t2);
_mm_store_ss(ptrA+8,t3);
- t4 = _mm_loadu_ps(ptrB);
- t5 = _mm_loadu_ps(ptrB+4);
- t6 = _mm_load_ss(ptrB+8);
t4 = _mm_sub_ps(t4,t21);
t5 = _mm_sub_ps(t5,t24);
t6 = _mm_sub_ss(t6,t17);
_mm_storeu_ps(ptrB,t4);
_mm_storeu_ps(ptrB+4,t5);
_mm_store_ss(ptrB+8,t6);
- t7 = _mm_loadu_ps(ptrC);
- t8 = _mm_loadu_ps(ptrC+4);
- t9 = _mm_load_ss(ptrC+8);
t7 = _mm_sub_ps(t7,t22);
t8 = _mm_sub_ps(t8,t25);
t9 = _mm_sub_ss(t9,t18);
_mm_storeu_ps(ptrC,t7);
_mm_storeu_ps(ptrC+4,t8);
_mm_store_ss(ptrC+8,t9);
- t10 = _mm_loadu_ps(ptrD);
- t11 = _mm_loadu_ps(ptrD+4);
- t12 = _mm_load_ss(ptrD+8);
t10 = _mm_sub_ps(t10,t14);
t11 = _mm_sub_ps(t11,t16);
t12 = _mm_sub_ss(t12,t19);
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 73 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 74 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*74);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*74);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 61 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 62 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*62);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*62);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 159 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 162 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*162);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*162);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 139 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 142 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*142);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*142);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 417 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 426 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*426);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 373 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 382 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*382);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 185 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 189 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*189);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*189);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 165 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 169 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*169);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*169);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*456);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*456);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*412);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*412);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 56 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 57 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*57);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*57);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 47 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 48 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*48);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*48);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 142 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 145 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*145);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*145);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 125 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 128 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*128);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*128);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 400 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 409 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*409);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 359 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 368 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*368);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 161 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 164 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*164);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*164);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 144 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 147 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*147);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*147);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*431);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*431);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*390);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*390);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 43 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 44 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*44);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*44);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 39 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 40 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*40);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*40);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 129 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 132 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*132);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*132);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 117 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 120 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*120);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 387 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 396 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*396);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 351 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 360 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*360);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 129 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 132 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*132);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*132);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 117 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 120 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*120);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 387 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 396 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*396);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*396);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 351 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 360 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*360);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*360);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 63 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 64 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*64);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*64);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 54 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 55 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*55);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*55);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 119 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 120 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*120);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 109 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*109);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*109);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 287 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 288 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*288);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*288);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 270 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 271 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*271);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*271);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 140 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 141 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*141);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*141);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 129 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 130 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*130);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*130);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*312);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*312);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*295);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*295);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 40 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 40 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*40);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*40);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 34 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 34 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*34);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*34);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 96 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 96 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*96);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*96);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 88 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 88 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*88);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*88);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 264 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 264 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*264);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*264);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 250 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 250 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*250);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*250);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 116 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 116 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*116);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*116);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 108 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*108);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*108);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*287);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*287);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*273);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*273);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 28 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 28 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*28);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*28);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 27 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 27 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*27);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*27);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 84 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 84 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*84);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*84);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++) /* Zero scratch: dummy (negative jjnr) j entries send their force decrement here instead of into f */
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 81 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 81 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*81);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*81);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++) /* Zero scratch: dummy (negative jjnr) j entries send their 3-rvec force decrement here instead of into f */
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 252 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 252 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*252);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*252);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++) /* Zero scratch: dummy (negative jjnr) j entries send their 3-rvec force decrement here instead of into f */
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 243 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 243 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*243);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*243);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++) /* Zero scratch: dummy (negative jjnr) j entries send their force decrement here instead of into f */
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 84 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 84 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*84);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*84);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++) /* Zero scratch: dummy (negative jjnr) j entries send their force decrement here instead of into f */
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 81 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 81 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*81);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*81);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++) /* Zero scratch: dummy (negative jjnr) j entries write at scratch+DIM (9 floats), which is why it holds 4*DIM reals */
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 252 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 252 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*252);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*252);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwNone_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 243 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 243 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*243);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*243);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*65);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*65);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*47);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*47);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*159);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*159);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*127);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*127);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 432 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 441 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*441);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 358 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 367 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*367);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*182);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*182);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*150);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*150);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*467);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*467);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*393);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*393);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*47);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*47);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*40);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*40);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*141);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*141);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*120);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 414 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 423 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*423);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*423);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 351 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 360 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*360);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*141);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*141);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*120);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 414 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 423 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*423);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*423);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 351 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 360 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*360);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*360);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*84);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*84);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*78);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*78);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*216);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*216);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*204);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*204);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 603 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 612 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*612);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*612);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 573 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 582 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*582);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*582);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*258);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*258);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*246);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*246);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*657);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*627);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*66);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*66);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*63);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*63);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*198);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*198);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*189);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*189);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 585 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 594 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*594);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*594);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 558 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 567 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*567);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*567);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*198);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*198);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*189);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*189);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 585 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 594 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*594);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*594);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 558 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 567 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*567);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*567);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 75 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 76 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*76);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*76);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 62 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 63 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*63);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*63);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 157 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 160 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*160);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*160);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 134 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 137 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*137);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*137);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 403 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 412 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*412);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*412);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 350 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 359 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*359);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 179 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 183 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*183);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*183);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 156 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 160 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*160);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*160);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*438);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*438);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*385);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*385);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 53 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 54 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*54);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*54);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 43 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 44 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*44);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*44);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 135 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 138 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*138);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*138);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 115 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 118 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*118);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*118);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 381 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 390 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*390);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*390);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 331 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 340 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*340);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*340);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 155 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 158 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*158);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*158);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 135 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 138 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*138);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*138);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*413);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*413);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*363);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*363);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 41 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 42 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*42);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*42);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 36 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 37 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*37);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*37);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 123 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 126 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*126);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*126);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 111 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*111);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*111);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 369 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 378 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*378);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*378);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 324 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 333 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*333);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*333);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 123 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 126 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*126);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*126);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 111 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*111);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*111);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 369 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 378 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*378);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*378);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 324 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 333 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*333);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*333);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
int nvdwtype;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 92 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* Dummy entries (negative list index) are pointed at scratch instead of dvda, so that compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 93 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 13 flops */
+ /* Outer loop uses 10 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*13 + inneriter*93);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*93);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwCSTab_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
int nvdwtype;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 82 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* Dummy entries (negative list index) are pointed at scratch instead of dvda, so that compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 83 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*83);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*83);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
int nvdwtype;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 71 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* Dummy entries (negative list index) are pointed at scratch instead of dvda, so that compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 72 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 13 flops */
+ /* Outer loop uses 10 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*13 + inneriter*72);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*10 + inneriter*72);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwLJ_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
int nvdwtype;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 64 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* Dummy entries (negative list index) are pointed at scratch instead of dvda, so that compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up. */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 65 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*65);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*65);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
__m128i vfitab;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 58 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* Point dummy (negative) list entries at scratch, so that this code stays correct even with compilers that take gmx_restrict seriously (e.g. icc 13). */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 59 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*12 + inneriter*59);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*9 + inneriter*59);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecGB_VdwNone_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
__m128 velec,felec,velecsum,facel,crf,krf,krf2;
real *charge;
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
__m128i vfitab;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 56 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
isaprod = _mm_mul_ps(isai0,isaj0);
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq00,_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+0,dvda+jnrB+0,dvda+jnrC+0,dvda+jnrD+0);
/* Calculate generalized born table index - this is a separate table from the normal one,
* but we use the same procedure by multiplying r with scale and truncating to integer.
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r00)));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
+ /* Point dummy (negative) list entries at scratch, so that this code stays correct even with compilers that take gmx_restrict seriously (e.g. icc 13). */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj0,isaj0)));
velec = _mm_mul_ps(qq00,rinv00);
felec = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(velec,rinv00),fgb),rinv00);
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 57 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*57);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*57);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 56 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 57 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*57);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*57);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwCSTab_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 48 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 49 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 9 flops */
+ /* Outer loop uses 6 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*49);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*49);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*41);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*41);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSh_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 9 flops */
+ /* Outer loop uses 6 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*30);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*30);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*60);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*60);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJSw_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 9 flops */
+ /* Outer loop uses 6 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*57);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*57);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 32 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 32 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*10 + inneriter*32);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*32);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJ_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 27 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 27 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 9 flops */
+ /* Outer loop uses 6 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*9 + inneriter*27);
+ inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*27);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*73);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*73);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*58);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*58);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*145);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*145);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*118);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*118);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 360 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 361 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*361);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*361);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 297 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 298 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*298);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*298);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*165);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*165);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*139);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*139);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
* CALCULATE INTERACTIONS *
**************************/
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
r00 = _mm_mul_ps(rsq00,rinv00);
/* Calculate table index by multiplying r with table scale and truncate to integer */
vvdw = _mm_add_ps(vvdw12,vvdw6);
fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
/* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
vvdwsum = _mm_add_ps(vvdwsum,vvdw);
fscal = fvdw;
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
/* Calculate temporary vectorial force */
tx = _mm_mul_ps(fscal,dx00);
ty = _mm_mul_ps(fscal,dy00);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
+ }
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
- /* Inner loop uses 383 flops */
+ /* Inner loop uses 387 flops */
}
if(jidx<j_index_end)
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
* CALCULATE INTERACTIONS *
**************************/
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
r00 = _mm_mul_ps(rsq00,rinv00);
r00 = _mm_andnot_ps(dummy_mask,r00);
vvdw = _mm_add_ps(vvdw12,vvdw6);
fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
/* Update potential sum for this i atom from the interaction with this j atom. */
+ vvdw = _mm_and_ps(vvdw,cutoff_mask);
vvdw = _mm_andnot_ps(dummy_mask,vvdw);
vvdwsum = _mm_add_ps(vvdwsum,vvdw);
fscal = fvdw;
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
fscal = _mm_andnot_ps(dummy_mask,fscal);
/* Calculate temporary vectorial force */
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
+ }
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
- /* Inner loop uses 384 flops */
+ /* Inner loop uses 388 flops */
}
/* End of innermost loop */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*384);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*388);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
* CALCULATE INTERACTIONS *
**************************/
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
r00 = _mm_mul_ps(rsq00,rinv00);
/* Calculate table index by multiplying r with table scale and truncate to integer */
fvdw12 = _mm_mul_ps(c12_00,FF);
fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
fscal = fvdw;
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
/* Calculate temporary vectorial force */
tx = _mm_mul_ps(fscal,dx00);
ty = _mm_mul_ps(fscal,dy00);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
+ }
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
- /* Inner loop uses 321 flops */
+ /* Inner loop uses 324 flops */
}
if(jidx<j_index_end)
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
* CALCULATE INTERACTIONS *
**************************/
+ if (gmx_mm_any_lt(rsq00,rcutoff2))
+ {
+
r00 = _mm_mul_ps(rsq00,rinv00);
r00 = _mm_andnot_ps(dummy_mask,r00);
fvdw12 = _mm_mul_ps(c12_00,FF);
fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
+ cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
+
fscal = fvdw;
+ fscal = _mm_and_ps(fscal,cutoff_mask);
+
fscal = _mm_andnot_ps(dummy_mask,fscal);
/* Calculate temporary vectorial force */
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
+ }
+
/**************************
* CALCULATE INTERACTIONS *
**************************/
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
- /* Inner loop uses 322 flops */
+ /* Inner loop uses 325 flops */
}
/* End of innermost loop */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*322);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*325);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*54);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*54);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*37);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*37);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*126);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*126);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*97);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*97);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 342 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 342 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*342);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*342);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 277 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 277 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*277);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*149);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*149);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*120);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*120);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*368);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*368);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*303);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*71);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*71);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*62);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*62);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*143);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*143);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*122);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*122);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 358 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 359 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*359);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*359);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 301 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 302 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*302);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*302);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*168);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*168);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*147);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*147);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*387);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*387);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*330);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*330);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*36);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*36);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*30);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*30);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*108);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*108);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*90);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*90);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 324 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 324 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*324);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*324);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 270 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 270 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*270);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*270);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*108);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*108);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*90);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*90);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 324 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 324 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*324);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*324);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 270 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
}
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 270 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*270);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*270);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 67 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 68 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*68);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*68);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 54 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 55 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*55);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*55);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 131 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 132 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*132);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*132);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 109 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*109);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*109);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 323 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 324 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*324);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*324);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 270 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 271 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*271);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*271);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 152 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 153 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*153);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*153);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 129 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 130 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*130);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*130);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*348);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*348);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*295);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*295);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 44 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 44 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 12 flops */
+ /* Outer loop uses 9 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*12 + inneriter*44);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*44);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 34 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 34 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*10 + inneriter*34);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*34);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 108 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*29 + inneriter*108);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*108);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 88 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 88 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*27 + inneriter*88);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*88);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 300 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 300 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 29 flops */
+ /* Outer loop uses 20 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*300);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*300);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 250 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 250 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*250);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*250);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 128 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 128 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*128);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*128);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 108 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 108 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*108);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*108);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*323);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*323);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*273);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*273);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 32 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 32 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 11 flops */
+ /* Outer loop uses 8 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*11 + inneriter*32);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VF,outeriter*8 + inneriter*32);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomP1P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 27 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 27 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 10 flops */
+ /* Outer loop uses 7 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*10 + inneriter*27);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_F,outeriter*7 + inneriter*27);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 96 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 96 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*28 + inneriter*96);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*96);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 81 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy0 = _mm_add_ps(fiy0,ty);
fiz0 = _mm_add_ps(fiz0,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 81 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*27 + inneriter*81);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*81);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 288 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 288 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*288);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*288);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3W3_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 243 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* Inner loop uses 243 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*243);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*243);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 96 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 96 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*28 + inneriter*96);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*96);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4P1_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 81 flops */
}
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fiy1 = _mm_add_ps(fiy1,ty);
fiz1 = _mm_add_ps(fiz1,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy2 = _mm_add_ps(fiy2,ty);
fiz2 = _mm_add_ps(fiz2,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/**************************
* CALCULATE INTERACTIONS *
fiy3 = _mm_add_ps(fiy3,ty);
fiz3 = _mm_add_ps(fiz3,tz);
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* Inner loop uses 81 flops */
}
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*27 + inneriter*81);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*81);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 288 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 288 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 28 flops */
+ /* Outer loop uses 19 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*288);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*288);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW4W4_F_sse4_1_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset1;
__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
fix1 = _mm_setzero_ps();
fiy1 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 243 flops */
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Inner loop uses 243 flops */
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 27 flops */
+ /* Outer loop uses 18 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*243);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*243);
}
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
/* #for I in PARTICLES_I */
int vdwioffset{I};
/* #endif */
/* #if 'GeneralizedBorn' in KERNEL_ELEC */
__m128i gbitab;
- __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+ __m128 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
__m128 minushalf = _mm_set1_ps(-0.5);
real *invsqrta,*dvda,*gbtab;
/* #endif */
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- /* ## Loop over i particles, but only include ones that we use - skip e.g. vdw-only sites for elec-only kernel */
- /* #for I in PARTICLES_I */
- ix{I} = _mm_set1_ps(shX + x[i_coord_offset+DIM*{I}+XX]);
- iy{I} = _mm_set1_ps(shY + x[i_coord_offset+DIM*{I}+YY]);
- iz{I} = _mm_set1_ps(shZ + x[i_coord_offset+DIM*{I}+ZZ]);
- /* #define OUTERFLOPS OUTERFLOPS+3 */
- /* #endfor */
+ /* #if GEOMETRY_I == 'Particle' */
+ gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+ /* #elif GEOMETRY_I == 'Water3' */
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+ /* #elif GEOMETRY_I == 'Water4' */
+ /* #if 0 in PARTICLES_I */
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+ /* #else */
+ gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+ &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+ /* #endif */
+ /* #endif */
/* #if 'Force' in KERNEL_VF */
/* #for I in PARTICLES_I */
/* #define INNERFLOPS 0 */
/* Get j neighbor index, and coordinate index */
+ /* #if ROUND =='Loop' */
jnrA = jjnr[jidx];
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
- /* #if ROUND =='Epilogue' */
+ /* #else */
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
/* #endif */
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
* CALCULATE INTERACTIONS *
**************************/
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
/* ## We always calculate rinv/rinvsq above to enable pipelineing in compilers (performance tested on x86) */
if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
{
isaprod = _mm_mul_ps(isai{I},isaj{J});
gbqqfactor = _mm_xor_ps(signbit,_mm_mul_ps(qq{I}{J},_mm_mul_ps(isaprod,gbinvepsdiff)));
gbscale = _mm_mul_ps(isaprod,gbtabscale);
- dvdaj = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+{J},dvda+jnrB+{J},dvda+jnrC+{J},dvda+jnrD+{J});
/* #define INNERFLOPS INNERFLOPS+5 */
/* Calculate generalized born table index - this is a separate table from the normal one,
gbitab = _mm_cvttps_epi32(rt);
gbeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
gbitab = _mm_slli_epi32(gbitab,2);
-
Y = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,0) );
F = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,1) );
G = _mm_load_ps( gbtab + gmx_mm_extract_epi32(gbitab,2) );
fgb = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
dvdatmp = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J})));
dvdasum = _mm_add_ps(dvdasum,dvdatmp);
- gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
- _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
+ /* #if ROUND == 'Loop' */
+ fjptrA = dvda+jnrA;
+ fjptrB = dvda+jnrB;
+ fjptrC = dvda+jnrC;
+ fjptrD = dvda+jnrD;
+ /* #else */
+ /* The pointers to scratch make sure that compilers that take gmx_restrict seriously (e.g. icc 13) really can't screw things up in this code. */
+ fjptrA = (jnrlistA>=0) ? dvda+jnrA : scratch;
+ fjptrB = (jnrlistB>=0) ? dvda+jnrB : scratch;
+ fjptrC = (jnrlistC>=0) ? dvda+jnrC : scratch;
+ fjptrD = (jnrlistD>=0) ? dvda+jnrD : scratch;
+ /* #endif */
+ gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
/* #define INNERFLOPS INNERFLOPS+13 */
/* #endif */
velec = _mm_mul_ps(qq{I}{J},rinv{I}{J});
/* #endif */
/* #endif */
/* #endif */
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
cutoff_mask = _mm_cmplt_ps(rsq{I}{J},rcutoff2);
/* #define INNERFLOPS INNERFLOPS+1 */
/* #endif */
/* #endif */
/* #endif */
/* #if 'vdw' in INTERACTION_FLAGS[I][J] */
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
vvdw = _mm_and_ps(vvdw,cutoff_mask);
/* #define INNERFLOPS INNERFLOPS+1 */
/* #endif */
fscal = fvdw;
/* #endif */
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
fscal = _mm_and_ps(fscal,cutoff_mask);
/* #define INNERFLOPS INNERFLOPS+1 */
/* #endif */
/* #define INNERFLOPS INNERFLOPS+6 */
/* #if GEOMETRY_J == 'Particle' */
- gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
- tx,ty,tz);
+ /* #if ROUND == 'Loop' */
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ /* #else */
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ /* #endif */
+ gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
/* #define INNERFLOPS INNERFLOPS+3 */
/* #else */
fjx{J} = _mm_add_ps(fjx{J},tx);
/* #endif */
- /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+ /* ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+ /* #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
/* #if 0 ## This and next two lines is a hack to maintain indentation in template file */
{
/* #endif */
/* #endfor */
/* ## End of loop over i-j interaction pairs */
+ /* #if GEOMETRY_J != 'Particle' */
+ /* #if ROUND == 'Loop' */
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+ /* #else */
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+ /* #endif */
+ /* #endif */
+
/* #if GEOMETRY_J == 'Water3' */
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
/* #define INNERFLOPS INNERFLOPS+9 */
/* #elif GEOMETRY_J == 'Water4' */
/* #if 0 in PARTICLES_J */
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* #define INNERFLOPS INNERFLOPS+12 */
/* #else */
- gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
- f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+ gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* #define INNERFLOPS INNERFLOPS+9 */
/* #endif */