Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecCoul_VdwNone_GeomW4P1_sse4_1_single.c
index 71a06791a00743919d0f34a204fed298cce436aa..d12786cf32cc4d094fcd448ce9a5c5f871b48da6 100644 (file)
@@ -196,6 +196,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -222,11 +226,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -254,11 +256,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -286,11 +286,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 84 flops */
         }
@@ -350,6 +355,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -379,11 +388,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -414,11 +421,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -449,11 +454,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_VF_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 84 flops */
         }
@@ -640,6 +650,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -663,11 +677,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -692,11 +704,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = f+j_coord_offsetA;
-            fjptrB             = f+j_coord_offsetB;
-            fjptrC             = f+j_coord_offsetC;
-            fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -721,11 +731,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = f+j_coord_offsetA;
             fjptrB             = f+j_coord_offsetB;
             fjptrC             = f+j_coord_offsetC;
             fjptrD             = f+j_coord_offsetD;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 81 flops */
         }
@@ -785,6 +800,10 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
                                                               charge+jnrC+0,charge+jnrD+0);
 
+            fjx0             = _mm_setzero_ps();
+            fjy0             = _mm_setzero_ps();
+            fjz0             = _mm_setzero_ps();
+
             /**************************
              * CALCULATE INTERACTIONS *
              **************************/
@@ -810,11 +829,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy1             = _mm_add_ps(fiy1,ty);
             fiz1             = _mm_add_ps(fiz1,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -841,11 +858,9 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy2             = _mm_add_ps(fiy2,ty);
             fiz2             = _mm_add_ps(fiz2,tz);
 
-            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
-            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
-            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
-            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -872,11 +887,16 @@ nb_kernel_ElecCoul_VdwNone_GeomW4P1_F_sse4_1_single
             fiy3             = _mm_add_ps(fiy3,ty);
             fiz3             = _mm_add_ps(fiz3,tz);
 
+            fjx0             = _mm_add_ps(fjx0,tx);
+            fjy0             = _mm_add_ps(fjy0,ty);
+            fjz0             = _mm_add_ps(fjz0,tz);
+
             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
+
+           gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 
             /* Inner loop uses 81 flops */
         }