Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sse2_single.c
index a295dadb83113393c99943c89ad1d3effaa8ff1c..ed609108936a97548bcc9f5e17abbaaca67b6c47 100644 (file)
@@ -226,6 +226,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -308,11 +312,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -355,11 +357,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -402,12 +402,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 157 flops */
         }
 
@@ -470,6 +475,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -557,11 +566,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -608,11 +615,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -659,12 +664,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 160 flops */
         }
 
@@ -880,6 +890,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -948,11 +962,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -987,11 +999,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1026,12 +1036,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 134 flops */
         }
 
@@ -1094,6 +1109,10 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             vdwjidx0C        = 2*vdwtype[jnrC+0];
             vdwjidx0D        = 2*vdwtype[jnrD+0];
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -1165,11 +1184,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy0             = _mm_add_ps(fiy0,ty);
             fiz0             = _mm_add_ps(fiz0,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1207,11 +1224,9 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
             
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1249,12 +1264,17 @@ nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_sse2_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+            
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
-            
+
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
+
             /* Inner loop uses 137 flops */
         }