Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_sse2_single.c
index 614a2ab909dd8662c98ff2d4c1937a12124b4575..dfa9ba72d26554bc34b0512a9c927e2e229d074c 100644 (file)
@@ -58,10 +58,13 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
     int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
-    real             shX,shY,shZ,rcutoff_scalar;
+    real             rcutoff_scalar;
     real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
     int              vdwioffset0;
     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
@@ -166,14 +169,16 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
     outeriter        = 0;
     inneriter        = 0;
 
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
     /* Start outer loop over neighborlists */
     for(iidx=0; iidx<nri; iidx++)
     {
         /* Load shift vector for this list */
         i_shift_offset   = DIM*shiftidx[iidx];
-        shX              = shiftvec[i_shift_offset+XX];
-        shY              = shiftvec[i_shift_offset+YY];
-        shZ              = shiftvec[i_shift_offset+ZZ];
 
         /* Load limits for loop over neighbors */
         j_index_start    = jindex[iidx];
@@ -184,19 +189,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
         i_coord_offset   = DIM*inr;
 
         /* Load i particle coords and add shift vector */
-        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
-        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
-        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
-        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
-        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
-        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
-        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
-        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
-        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
-        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        
         fix0             = _mm_setzero_ps();
         fiy0             = _mm_setzero_ps();
         fiz0             = _mm_setzero_ps();
@@ -223,7 +218,6 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             jnrB             = jjnr[jidx+1];
             jnrC             = jjnr[jidx+2];
             jnrD             = jjnr[jidx+3];
-
             j_coord_offsetA  = DIM*jnrA;
             j_coord_offsetB  = DIM*jnrB;
             j_coord_offsetC  = DIM*jnrC;
@@ -317,6 +311,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
              * CALCULATE INTERACTIONS *
              **************************/
 
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
             r00              = _mm_mul_ps(rsq00,rinv00);
 
             /* Calculate table index by multiplying r with table scale and truncate to integer */
@@ -354,11 +351,16 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             vvdw             = _mm_add_ps(vvdw12,vvdw6);
             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
             /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 
             fscal            = fvdw;
 
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
             /* Calculate temporary vectorial force */
             tx               = _mm_mul_ps(fscal,dx00);
             ty               = _mm_mul_ps(fscal,dy00);
@@ -372,6 +374,8 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx0             = _mm_add_ps(fjx0,tx);
             fjy0             = _mm_add_ps(fjy0,ty);
             fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -407,7 +411,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -444,7 +448,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -481,7 +485,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
             /**************************
@@ -518,7 +522,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -555,7 +559,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -592,7 +596,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
             /**************************
@@ -629,7 +633,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -666,7 +670,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -703,36 +707,38 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
-            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
-                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 
-            /* Inner loop uses 383 flops */
+            /* Inner loop uses 387 flops */
         }
 
         if(jidx<j_index_end)
         {
 
             /* Get j neighbor index, and coordinate index */
-            jnrA             = jjnr[jidx];
-            jnrB             = jjnr[jidx+1];
-            jnrC             = jjnr[jidx+2];
-            jnrD             = jjnr[jidx+3];
-
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
             /* Sign of each element will be negative for non-real atoms.
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
              */
             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
-            jnrA       = (jnrA>=0) ? jnrA : 0;
-            jnrB       = (jnrB>=0) ? jnrB : 0;
-            jnrC       = (jnrC>=0) ? jnrC : 0;
-            jnrD       = (jnrD>=0) ? jnrD : 0;
-
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
             j_coord_offsetA  = DIM*jnrA;
             j_coord_offsetB  = DIM*jnrB;
             j_coord_offsetC  = DIM*jnrC;
@@ -826,6 +832,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
              * CALCULATE INTERACTIONS *
              **************************/
 
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
             r00              = _mm_mul_ps(rsq00,rinv00);
             r00              = _mm_andnot_ps(dummy_mask,r00);
 
@@ -864,12 +873,17 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             vvdw             = _mm_add_ps(vvdw12,vvdw6);
             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
             /* Update potential sum for this i atom from the interaction with this j atom. */
+            vvdw             = _mm_and_ps(vvdw,cutoff_mask);
             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 
             fscal            = fvdw;
 
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 
             /* Calculate temporary vectorial force */
@@ -885,6 +899,8 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx0             = _mm_add_ps(fjx0,tx);
             fjy0             = _mm_add_ps(fjy0,ty);
             fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -923,7 +939,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -963,7 +979,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -1003,7 +1019,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
             /**************************
@@ -1043,7 +1059,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -1083,7 +1099,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -1123,7 +1139,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
             /**************************
@@ -1163,7 +1179,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -1203,7 +1219,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -1243,15 +1259,19 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
-            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
-                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 
-            /* Inner loop uses 384 flops */
+            /* Inner loop uses 388 flops */
         }
 
         /* End of innermost loop */
@@ -1267,7 +1287,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
         /* Increment number of inner iterations */
         inneriter                  += j_index_end - j_index_start;
 
-        /* Outer loop uses 38 flops */
+        /* Outer loop uses 26 flops */
     }
 
     /* Increment number of outer iterations */
@@ -1275,7 +1295,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_single
 
     /* Update outer/inner flops */
 
-    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*384);
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*388);
 }
 /*
  * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
@@ -1302,10 +1322,13 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
     int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
-    real             shX,shY,shZ,rcutoff_scalar;
+    real             rcutoff_scalar;
     real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
     int              vdwioffset0;
     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
@@ -1410,14 +1433,16 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
     outeriter        = 0;
     inneriter        = 0;
 
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
     /* Start outer loop over neighborlists */
     for(iidx=0; iidx<nri; iidx++)
     {
         /* Load shift vector for this list */
         i_shift_offset   = DIM*shiftidx[iidx];
-        shX              = shiftvec[i_shift_offset+XX];
-        shY              = shiftvec[i_shift_offset+YY];
-        shZ              = shiftvec[i_shift_offset+ZZ];
 
         /* Load limits for loop over neighbors */
         j_index_start    = jindex[iidx];
@@ -1428,19 +1453,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
         i_coord_offset   = DIM*inr;
 
         /* Load i particle coords and add shift vector */
-        ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
-        iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
-        iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
-        ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
-        iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
-        iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
-        ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
-        iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
-        iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
-        ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
-        iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
-        iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        
         fix0             = _mm_setzero_ps();
         fiy0             = _mm_setzero_ps();
         fiz0             = _mm_setzero_ps();
@@ -1463,7 +1478,6 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             jnrB             = jjnr[jidx+1];
             jnrC             = jjnr[jidx+2];
             jnrD             = jjnr[jidx+3];
-
             j_coord_offsetA  = DIM*jnrA;
             j_coord_offsetB  = DIM*jnrB;
             j_coord_offsetC  = DIM*jnrC;
@@ -1557,6 +1571,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
              * CALCULATE INTERACTIONS *
              **************************/
 
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
             r00              = _mm_mul_ps(rsq00,rinv00);
 
             /* Calculate table index by multiplying r with table scale and truncate to integer */
@@ -1589,8 +1606,12 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fvdw12           = _mm_mul_ps(c12_00,FF);
             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
             fscal            = fvdw;
 
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
             /* Calculate temporary vectorial force */
             tx               = _mm_mul_ps(fscal,dx00);
             ty               = _mm_mul_ps(fscal,dy00);
@@ -1604,6 +1625,8 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx0             = _mm_add_ps(fjx0,tx);
             fjy0             = _mm_add_ps(fjy0,ty);
             fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -1634,7 +1657,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -1666,7 +1689,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -1698,7 +1721,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
             /**************************
@@ -1730,7 +1753,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -1762,7 +1785,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -1794,7 +1817,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
             /**************************
@@ -1826,7 +1849,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -1858,7 +1881,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -1890,36 +1913,38 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
-            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
-                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 
-            /* Inner loop uses 321 flops */
+            /* Inner loop uses 324 flops */
         }
 
         if(jidx<j_index_end)
         {
 
             /* Get j neighbor index, and coordinate index */
-            jnrA             = jjnr[jidx];
-            jnrB             = jjnr[jidx+1];
-            jnrC             = jjnr[jidx+2];
-            jnrD             = jjnr[jidx+3];
-
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
             /* Sign of each element will be negative for non-real atoms.
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
              */
             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
-            jnrA       = (jnrA>=0) ? jnrA : 0;
-            jnrB       = (jnrB>=0) ? jnrB : 0;
-            jnrC       = (jnrC>=0) ? jnrC : 0;
-            jnrD       = (jnrD>=0) ? jnrD : 0;
-
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
             j_coord_offsetA  = DIM*jnrA;
             j_coord_offsetB  = DIM*jnrB;
             j_coord_offsetC  = DIM*jnrC;
@@ -2013,6 +2038,9 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
              * CALCULATE INTERACTIONS *
              **************************/
 
+            if (gmx_mm_any_lt(rsq00,rcutoff2))
+            {
+
             r00              = _mm_mul_ps(rsq00,rinv00);
             r00              = _mm_andnot_ps(dummy_mask,r00);
 
@@ -2046,8 +2074,12 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fvdw12           = _mm_mul_ps(c12_00,FF);
             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 
+            cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
+
             fscal            = fvdw;
 
+            fscal            = _mm_and_ps(fscal,cutoff_mask);
+
             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 
             /* Calculate temporary vectorial force */
@@ -2063,6 +2095,8 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx0             = _mm_add_ps(fjx0,tx);
             fjy0             = _mm_add_ps(fjy0,ty);
             fjz0             = _mm_add_ps(fjz0,tz);
+            
+            }
 
             /**************************
              * CALCULATE INTERACTIONS *
@@ -2095,7 +2129,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -2129,7 +2163,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -2163,7 +2197,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
             /**************************
@@ -2197,7 +2231,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -2231,7 +2265,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -2265,7 +2299,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
             /**************************
@@ -2299,7 +2333,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx1             = _mm_add_ps(fjx1,tx);
             fjy1             = _mm_add_ps(fjy1,ty);
             fjz1             = _mm_add_ps(fjz1,tz);
-
+            
             }
 
             /**************************
@@ -2333,7 +2367,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx2             = _mm_add_ps(fjx2,tx);
             fjy2             = _mm_add_ps(fjy2,ty);
             fjz2             = _mm_add_ps(fjz2,tz);
-
+            
             }
 
             /**************************
@@ -2367,15 +2401,19 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
             fjx3             = _mm_add_ps(fjx3,tx);
             fjy3             = _mm_add_ps(fjy3,ty);
             fjz3             = _mm_add_ps(fjz3,tz);
-
+            
             }
 
-            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
-                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
 
-            /* Inner loop uses 322 flops */
+            /* Inner loop uses 325 flops */
         }
 
         /* End of innermost loop */
@@ -2386,7 +2424,7 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
         /* Increment number of inner iterations */
         inneriter                  += j_index_end - j_index_start;
 
-        /* Outer loop uses 36 flops */
+        /* Outer loop uses 24 flops */
     }
 
     /* Increment number of outer iterations */
@@ -2394,5 +2432,5 @@ nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_single
 
     /* Update outer/inner flops */
 
-    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*322);
+    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*325);
 }