Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_template_sse2_single.pre
index 7173bdddd8ca9defad336ab7a8e00655c0601f04..dc038346f8531bd21dca18a6c8c08b781486afed 100644 (file)
@@ -77,10 +77,13 @@ void
     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
     int              jnrA,jnrB,jnrC,jnrD;
+    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
-    real             shX,shY,shZ,rcutoff_scalar;
+    real             rcutoff_scalar;
     real             *shiftvec,*fshift,*x,*f;
+    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+    real             scratch[4*DIM];
     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
     /* #for I in PARTICLES_I */
     int              vdwioffset{I};
@@ -99,7 +102,7 @@ void
     /* #endif */
     /* #if 'GeneralizedBorn' in KERNEL_ELEC */
     __m128i          gbitab;
-    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
+    __m128           vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp;
     __m128           minushalf = _mm_set1_ps(-0.5);
     real             *invsqrta,*dvda,*gbtab;
     /* #endif */
@@ -266,14 +269,16 @@ void
     outeriter        = 0;
     inneriter        = 0;
 
+    for(iidx=0;iidx<4*DIM;iidx++)
+    {
+        scratch[iidx] = 0.0;
+    }  
+
     /* Start outer loop over neighborlists */
     for(iidx=0; iidx<nri; iidx++)
     {
         /* Load shift vector for this list */
         i_shift_offset   = DIM*shiftidx[iidx];
-        shX              = shiftvec[i_shift_offset+XX];
-        shY              = shiftvec[i_shift_offset+YY];
-        shZ              = shiftvec[i_shift_offset+ZZ];
 
         /* Load limits for loop over neighbors */
         j_index_start    = jindex[iidx];
@@ -284,14 +289,21 @@ void
         i_coord_offset   = DIM*inr;
 
         /* Load i particle coords and add shift vector */
-        /* ## Loop over i particles, but only include ones that we use - skip e.g. vdw-only sites for elec-only kernel */
-        /*     #for I in PARTICLES_I */
-        ix{I}              = _mm_set1_ps(shX + x[i_coord_offset+DIM*{I}+XX]);
-        iy{I}              = _mm_set1_ps(shY + x[i_coord_offset+DIM*{I}+YY]);
-        iz{I}              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*{I}+ZZ]);
-        /*     #define OUTERFLOPS OUTERFLOPS+3 */
-        /* #endfor */
-
+        /* #if GEOMETRY_I == 'Particle' */
+        gmx_mm_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
+        /* #elif GEOMETRY_I == 'Water3' */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
+        /* #elif GEOMETRY_I == 'Water4' */
+        /*     #if 0 in PARTICLES_I                 */
+        gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+                                                 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        /*     #else                                */
+        gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
+                                                 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+        /*     #endif                               */
+        /* #endif                                   */
+        
         /* #if 'Force' in KERNEL_VF */
         /*     #for I in PARTICLES_I */
         fix{I}             = _mm_setzero_ps();
@@ -349,22 +361,25 @@ void
         /* #define INNERFLOPS 0 */
 
             /* Get j neighbor index, and coordinate index */
+            /* #if ROUND =='Loop' */
             jnrA             = jjnr[jidx];
             jnrB             = jjnr[jidx+1];
             jnrC             = jjnr[jidx+2];
             jnrD             = jjnr[jidx+3];
-
-            /* #if ROUND =='Epilogue' */
+            /* #else */
+            jnrlistA         = jjnr[jidx];
+            jnrlistB         = jjnr[jidx+1];
+            jnrlistC         = jjnr[jidx+2];
+            jnrlistD         = jjnr[jidx+3];
             /* Sign of each element will be negative for non-real atoms.
              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
              */
             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
-            jnrA       = (jnrA>=0) ? jnrA : 0;
-            jnrB       = (jnrB>=0) ? jnrB : 0;
-            jnrC       = (jnrC>=0) ? jnrC : 0;
-            jnrD       = (jnrD>=0) ? jnrD : 0;
-
+            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
+            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
+            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
+            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
             /* #endif */
             j_coord_offsetA  = DIM*jnrA;
             j_coord_offsetB  = DIM*jnrB;
@@ -458,7 +473,8 @@ void
              * CALCULATE INTERACTIONS *
              **************************/
 
-            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
             /*         ## We always calculate rinv/rinvsq above to enable pipelining in compilers (performance tested on x86) */
             if (gmx_mm_any_lt(rsq{I}{J},rcutoff2))
             {
@@ -542,7 +558,6 @@ void
             isaprod          = _mm_mul_ps(isai{I},isaj{J});
             gbqqfactor       = _mm_xor_ps(signbit,_mm_mul_ps(qq{I}{J},_mm_mul_ps(isaprod,gbinvepsdiff)));
             gbscale          = _mm_mul_ps(isaprod,gbtabscale);
-            dvdaj            = gmx_mm_load_4real_swizzle_ps(dvda+jnrA+{J},dvda+jnrB+{J},dvda+jnrC+{J},dvda+jnrD+{J});
             /*             #define INNERFLOPS INNERFLOPS+5 */
 
             /* Calculate generalized born table index - this is a separate table from the normal one,
@@ -569,8 +584,19 @@ void
             fgb              = _mm_mul_ps(gbqqfactor,_mm_mul_ps(FF,gbscale));
             dvdatmp          = _mm_mul_ps(minushalf,_mm_add_ps(vgb,_mm_mul_ps(fgb,r{I}{J})));
             dvdasum          = _mm_add_ps(dvdasum,dvdatmp);
-            gmx_mm_store_4real_swizzle_ps(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,
-                                          _mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
+            /*                 #if ROUND == 'Loop' */
+            fjptrA           = dvda+jnrA;
+            fjptrB           = dvda+jnrB;
+            fjptrC           = dvda+jnrC;
+            fjptrD           = dvda+jnrD;
+            /*                 #else */
+            /* The pointers to scratch make sure that this code really can't screw things up when built with compilers that take gmx_restrict seriously (e.g. icc 13). */
+            fjptrA             = (jnrlistA>=0) ? dvda+jnrA : scratch;
+            fjptrB             = (jnrlistB>=0) ? dvda+jnrB : scratch;
+            fjptrC             = (jnrlistC>=0) ? dvda+jnrC : scratch;
+            fjptrD             = (jnrlistD>=0) ? dvda+jnrD : scratch;
+            /*                 #endif */
+            gmx_mm_increment_4real_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,_mm_mul_ps(dvdatmp,_mm_mul_ps(isaj{J},isaj{J})));
             /*                 #define INNERFLOPS INNERFLOPS+13 */
             /*             #endif */
             velec            = _mm_mul_ps(qq{I}{J},rinv{I}{J});
@@ -769,7 +795,8 @@ void
             /*             #endif */
             /*         #endif */
             /*     #endif */
-            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
             cutoff_mask      = _mm_cmplt_ps(rsq{I}{J},rcutoff2);
             /*         #define INNERFLOPS INNERFLOPS+1 */
             /*     #endif */
@@ -799,7 +826,8 @@ void
             /*             #endif */
             /*         #endif */
             /*         #if 'vdw' in INTERACTION_FLAGS[I][J] */
-            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
             /*                 #define INNERFLOPS INNERFLOPS+1 */
             /*             #endif                                       */
@@ -822,10 +850,11 @@ void
             fscal            = fvdw;
             /*        #endif */
 
-            /*             #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            /*        ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+            /*        #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
             fscal            = _mm_and_ps(fscal,cutoff_mask);
-            /*                 #define INNERFLOPS INNERFLOPS+1 */
-            /*             #endif                                       */
+            /*             #define INNERFLOPS INNERFLOPS+1 */
+            /*         #endif                                       */
 
             /*             #if ROUND == 'Epilogue' */
             fscal            = _mm_andnot_ps(dummy_mask,fscal);
@@ -843,9 +872,18 @@ void
             /*             #define INNERFLOPS INNERFLOPS+6 */
 
             /* #if GEOMETRY_J == 'Particle'             */
-            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
-                                                   f+j_coord_offsetC,f+j_coord_offsetD,
-                                                   tx,ty,tz);
+            /*     #if ROUND == 'Loop' */
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            /*     #else */
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            /*     #endif */
+            gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,tx,ty,tz);
             /*     #define INNERFLOPS INNERFLOPS+3      */
             /* #else                                    */
             fjx{J}             = _mm_add_ps(fjx{J},tx);
@@ -853,10 +891,11 @@ void
             fjz{J}             = _mm_add_ps(fjz{J},tz);
             /*     #define INNERFLOPS INNERFLOPS+3      */
             /* #endif                                   */
-
+            
             /*     #endif */
 
-            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] */
+            /*     ## Note special check for TIP4P-TIP4P. Since we are cutting off all hydrogen interactions we also cut the LJ-only O-O interaction */
+            /*     #if 'exactcutoff' in INTERACTION_FLAGS[I][J] or (GEOMETRY_I=='Water4' and GEOMETRY_J=='Water4' and 'exactcutoff' in INTERACTION_FLAGS[1][1]) */
             /*         #if 0    ## This and next two lines is a hack to maintain indentation in template file */
             {
                 /*     #endif */
@@ -867,21 +906,32 @@ void
             /* #endfor */
             /* ## End of loop over i-j interaction pairs */
 
+            /* #if GEOMETRY_J != 'Particle' */
+            /*     #if ROUND == 'Loop' */
+            fjptrA             = f+j_coord_offsetA;
+            fjptrB             = f+j_coord_offsetB;
+            fjptrC             = f+j_coord_offsetC;
+            fjptrD             = f+j_coord_offsetD;
+            /*     #else */
+            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+            /*     #endif */
+            /* #endif */
+
             /* #if GEOMETRY_J == 'Water3'               */
-            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
-                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
             /*     #define INNERFLOPS INNERFLOPS+9      */
             /* #elif GEOMETRY_J == 'Water4'             */
             /*     #if 0 in PARTICLES_J                 */
-            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
-                                                   f+j_coord_offsetC,f+j_coord_offsetD,
+            gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
             /*     #define INNERFLOPS INNERFLOPS+12     */
             /*     #else                                */
-            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
-                                                   f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
+            gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
             /*     #define INNERFLOPS INNERFLOPS+9      */
             /*     #endif                               */