Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sse4_1_single.c
index 711ff0c3726ef638be6e607b55b1b61adb06a8d6..d325eb1415e80db0a8dd784bcd003d920af0e0b2 100644 (file)
@@ -218,6 +218,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -287,11 +291,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -319,11 +321,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -351,11 +351,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 119 flops */
         }
@@ -419,6 +424,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -493,11 +502,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -528,11 +535,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -563,11 +568,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 120 flops */
         }
@@ -776,6 +786,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -836,11 +850,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -865,11 +877,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -894,11 +904,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 108 flops */
         }
@@ -962,6 +977,10 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1025,11 +1044,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1056,11 +1073,9 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1087,11 +1102,16 @@ nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 109 flops */
         }