Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sse4_1_single.c
index 54c53e5c88f7ef43435cc22c5a606f01b72356be..587b07dc118dc855f559b4a1174f725c96351b41 100644 (file)
@@ -222,6 +222,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -256,11 +260,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -288,11 +290,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -320,11 +320,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -352,11 +350,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 116 flops */
         }
@@ -425,6 +428,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -462,11 +469,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -497,11 +502,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -532,11 +535,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -567,11 +568,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 116 flops */
         }
@@ -784,6 +790,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -812,11 +822,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -841,11 +849,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -870,11 +876,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -899,11 +903,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -972,6 +981,10 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1002,11 +1015,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1033,11 +1044,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1064,11 +1073,9 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1095,11 +1102,16 @@ nb_kernel_ElecCoul_VdwLJ_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }