Copied SSE intrinsics kernels from x86-64 to ia32
author: Erik Lindahl <lindahl@cbr.su.se>
Sun, 22 Aug 2010 20:58:32 +0000 (22:58 +0200)
committer: Erik Lindahl <lindahl@cbr.su.se>
Sun, 22 Aug 2010 20:58:32 +0000 (22:58 +0200)
src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel400_ia32_sse.c
src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel410_ia32_sse.c
src/gmxlib/nonbonded/nb_kernel_ia32_sse/nb_kernel430_ia32_sse.c
src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel400_ia32_sse2.c
src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel410_ia32_sse2.c
src/gmxlib/nonbonded/nb_kernel_ia32_sse2/nb_kernel430_ia32_sse2.c

index ec24e41fc0c8c41bd97230570f0285649b1b5cca..fd1ea7135573eb9a6fe1baeabd47ba3386dba182 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Options used when generation this file:
  * Language:         c
- * Precision:        double
+ * Precision:        single
  * Threads:          no
  * Software invsqrt: yes
  * Prefetch forces:  no
 #include <xmmintrin.h>
 #include <emmintrin.h>
 
-#include <gmx_sse2_double.h>
+#include <gmx_sse2_single.h>
 
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
 
-#include "nb_kernel400_ia32_sse2.h"
+#include "nb_kernel400_ia32_sse.h"
 
 
-void nb_kernel400_ia32_sse2(int *           p_nri,
-                              int *           iinr,
-                              int *           jindex,
-                              int *           jjnr,
-                              int *           shift,
-                              double *         shiftvec,
-                              double *         fshift,
-                              int *           gid,
-                              double *         pos,
-                              double *         faction,
-                              double *         charge,
-                              double *         p_facel,
-                              double *         p_krf,
-                              double *         p_crf,
-                              double *         vc,
-                              int *           type,
-                              int *           p_ntype,
-                              double *         vdwparam,
-                              double *         vvdw,
-                              double *         p_tabscale,
-                              double *         VFtab,
-                              double *         invsqrta,
-                              double *         dvda,
-                              double *         p_gbtabscale,
-                              double *         GBtab,
-                              int *           p_nthreads,
-                              int *           count,
-                              void *          mtx,
-                              int *           outeriter,
-                              int *           inneriter,
-                              double *         work)
+
+void nb_kernel400_ia32_sse(int *           p_nri,
+                    int *           iinr,
+                    int *           jindex,
+                    int *           jjnr,
+                    int *           shift,
+                    float *         shiftvec,
+                    float *         fshift,
+                    int *           gid,
+                    float *         pos,
+                    float *         faction,
+                    float *         charge,
+                    float *         p_facel,
+                    float *         p_krf,
+                    float *         p_crf,
+                    float *         vc,
+                    int *           type,
+                    int *           p_ntype,
+                    float *         vdwparam,
+                    float *         vvdw,
+                    float *         p_tabscale,
+                    float *         VFtab,
+                    float *         invsqrta,
+                    float *         dvda,
+                    float *         p_gbtabscale,
+                    float *         GBtab,
+                    int *           p_nthreads,
+                    int *           count,
+                    void *          mtx,
+                    int *           outeriter,
+                    int *           inneriter,
+                    float *         work)
 {
     int           nri,nthreads;
     int           n,ii,is3,ii3,k,nj0,nj1,ggid;
-    double        shX,shY,shZ;
-    int           jnrA,jnrB;
-    int           j3A,j3B;
+    float         shX,shY,shZ;
+       int                       offset;
+    int           jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
+    int           jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
        gmx_gbdata_t *gbdata;
-       double *      gpol;
+       float *        gpol;
+       
+       __m128   iq,qq,qqB,jq,jqB,isai;
+       __m128   ix,iy,iz;
+       __m128   jx,jy,jz;
+       __m128   jxB,jyB,jzB;
+       __m128   dx,dy,dz;
+       __m128   dxB,dyB,dzB;
+       __m128   vctot,vgbtot,dvdasum,gbfactor;
+       __m128   fix,fiy,fiz,tx,ty,tz,rsq;
+       __m128   fixB,fiyB,fizB,txB,tyB,tzB,rsqB;
+       __m128   rinv,isaj,isaprod;
+       __m128   rinvB,isajB,isaprodB;
+       __m128   vcoul,vcoulB,fscal,fscalB,gbscale,gbscaleB;
+       __m128   r,rtab;
+       __m128   rB,rtabB;
+       __m128   eps,Y,F,G,H;
+       __m128   epsB,YB,FB,GB,HB;
+       __m128   vgb,vgbB,fijGB,fijGBB,dvdatmp,dvdatmpB;
+       __m128   facel,gbtabscale,mask,dvdaj;
+    
+    __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
+       __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
+       __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
     
-       __m128d  iq,qq,jq,isai;
-       __m128d  ix,iy,iz;
-       __m128d  jx,jy,jz;
-       __m128d  dx,dy,dz;
-       __m128d  vctot,vgbtot,dvdasum,gbfactor;
-       __m128d  fix,fiy,fiz,tx,ty,tz,rsq;
-       __m128d  rinv,isaj,isaprod;
-       __m128d  vcoul,fscal,gbscale;
-       __m128d  rinvsq,r,rtab;
-       __m128d  eps,Y,F,G,H;
-       __m128d  vgb,fijGB,dvdatmp;
-       __m128d  facel,gbtabscale,dvdaj;
        __m128i  n0, nnn;
+       __m128i  n0B, nnnB;
        
-       const __m128d neg        = _mm_set1_pd(-1.0);
-       const __m128d zero       = _mm_set1_pd(0.0);
-       const __m128d minushalf  = _mm_set1_pd(-0.5);
-       const __m128d two        = _mm_set1_pd(2.0);
+       const __m128 neg        = {-1.0f,-1.0f,-1.0f,-1.0f};
+       const __m128 zero       = {0.0f,0.0f,0.0f,0.0f};
+       const __m128 minushalf  = {-0.5f,-0.5f,-0.5f,-0.5f};
+       const __m128 two        = {2.0f,2.0f,2.0f,2.0f};
+
+       gbdata          = (gmx_gbdata_t *)work;
+       gpol            = gbdata->gpol;
        
-       gbdata     = (gmx_gbdata_t *)work;
-       gpol       = gbdata->gpol;
-    
-       nri        = *p_nri;
+       nri              = *p_nri;         
     
-    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
-    gbtabscale = _mm_load1_pd(p_gbtabscale);  
-    facel      = _mm_load1_pd(p_facel);
+    gbfactor         = _mm_set1_ps( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+    gbtabscale       = _mm_load1_ps(p_gbtabscale);  
+    facel            = _mm_load1_ps(p_facel);
+
+    nj1              = 0;
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j3A = j3B = j3C = j3D = 0;
+    jx               = _mm_setzero_ps();
+    jy               = _mm_setzero_ps();
+    jz               = _mm_setzero_ps();
     
-    nj1         = 0;
-    jnrA = jnrB = 0;
-    j3A = j3B   = 0;
-    jx          = _mm_setzero_pd();
-    jy          = _mm_setzero_pd();
-    jz          = _mm_setzero_pd();
-       
-       for(n=0;n<nri;n++)
-       {
+    for(n=0; (n<nri); n++)
+    {
         is3              = 3*shift[n];     
         shX              = shiftvec[is3];  
         shY              = shiftvec[is3+1];
@@ -114,206 +128,535 @@ void nb_kernel400_ia32_sse2(int *           p_nri,
         nj1              = jindex[n+1];    
         ii               = iinr[n];        
         ii3              = 3*ii;           
-               
-               ix               = _mm_set1_pd(shX+pos[ii3+0]);
-               iy               = _mm_set1_pd(shY+pos[ii3+1]);
-               iz               = _mm_set1_pd(shZ+pos[ii3+2]);
-        
-               iq               = _mm_load1_pd(charge+ii);
-               iq               = _mm_mul_pd(iq,facel);
         
-               isai             = _mm_load1_pd(invsqrta+ii);
-                       
-               vctot            = _mm_setzero_pd();
-               vgbtot           = _mm_setzero_pd();
-               dvdasum          = _mm_setzero_pd();
-               fix              = _mm_setzero_pd();
-               fiy              = _mm_setzero_pd();
-               fiz              = _mm_setzero_pd();
-                
-               for(k=nj0;k<nj1-1; k+=2)
+               ix               = _mm_set1_ps(shX+pos[ii3+0]);
+               iy               = _mm_set1_ps(shY+pos[ii3+1]);
+               iz               = _mm_set1_ps(shZ+pos[ii3+2]);
+       
+               iq               = _mm_load1_ps(charge+ii);
+               iq               = _mm_mul_ps(iq,facel);
+                       
+               isai             = _mm_load1_ps(invsqrta+ii);
+                                                               
+               vctot            = _mm_setzero_ps();
+               vgbtot           = _mm_setzero_ps();
+               dvdasum          = _mm_setzero_ps();
+               fix              = _mm_setzero_ps();
+               fiy              = _mm_setzero_ps();
+               fiz              = _mm_setzero_ps();
+
+        for(k=nj0; k<nj1-7; k+=8)
                {
-                       jnrA    = jjnr[k];
-                       jnrB    = jjnr[k+1];
+                       jnrA        = jjnr[k];   
+                       jnrB        = jjnr[k+1];
+                       jnrC        = jjnr[k+2];
+                       jnrD        = jjnr[k+3];
+                       jnrE        = jjnr[k+4];   
+                       jnrF        = jjnr[k+5];
+                       jnrG        = jjnr[k+6];
+                       jnrH        = jjnr[k+7];
+            
+            j3A         = 3*jnrA;  
+                       j3B         = 3*jnrB;
+                       j3C         = 3*jnrC;
+                       j3D         = 3*jnrD;
+            j3E         = 3*jnrE;  
+                       j3F         = 3*jnrF;
+                       j3G         = 3*jnrG;
+                       j3H         = 3*jnrH;
+            
+
+            GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
+            GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3E,pos+j3F,pos+j3G,pos+j3H,jxB,jyB,jzB);
+
+                       dx           = _mm_sub_ps(ix,jx);
+                       dy           = _mm_sub_ps(iy,jy);
+                       dz           = _mm_sub_ps(iz,jz);
+                       dxB          = _mm_sub_ps(ix,jxB);
+                       dyB          = _mm_sub_ps(iy,jyB);
+                       dzB          = _mm_sub_ps(iz,jzB);
+                       
+                       rsq          = gmx_mm_calc_rsq_ps(dx,dy,dz);
+                       rsqB         = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
                        
-                       j3A     = jnrA * 3;
-                       j3B     = jnrB * 3;
+                       /* invsqrt */                                                           
+                       rinv         = gmx_mm_invsqrt_ps(rsq);
+                       rinvB        = gmx_mm_invsqrt_ps(rsqB);
+
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
+                       GMX_MM_LOAD_4VALUES_PS(charge+jnrE,charge+jnrF,charge+jnrG,charge+jnrH,jqB);
+                       GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
+                       GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrE,invsqrta+jnrF,invsqrta+jnrG,invsqrta+jnrH,isajB);
+
+                       isaprod      = _mm_mul_ps(isai,isaj);
+                       isaprodB     = _mm_mul_ps(isai,isajB);
+                       qq           = _mm_mul_ps(iq,jq);            
+                       qqB          = _mm_mul_ps(iq,jqB);            
+                       vcoul        = _mm_mul_ps(qq,rinv);
+                       vcoulB       = _mm_mul_ps(qqB,rinvB);
+                       fscal        = _mm_mul_ps(vcoul,rinv);                                 
+                       fscalB       = _mm_mul_ps(vcoulB,rinvB);                                 
+            vctot        = _mm_add_ps(vctot,vcoul);
+            vctot        = _mm_add_ps(vctot,vcoulB);
             
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+            /* Polarization interaction */
+                       qq           = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+                       qqB          = _mm_mul_ps(qqB,_mm_mul_ps(isaprodB,gbfactor));
+                       gbscale      = _mm_mul_ps(isaprod,gbtabscale);
+                       gbscaleB     = _mm_mul_ps(isaprodB,gbtabscale);
+
+                       /* Calculate GB table index */
+                       r            = _mm_mul_ps(rsq,rinv);
+                       rB           = _mm_mul_ps(rsqB,rinvB);
+                       rtab         = _mm_mul_ps(r,gbscale);
+                       rtabB        = _mm_mul_ps(rB,gbscaleB);
+
+                       n0           = _mm_cvttps_epi32(rtab);
+                       n0B          = _mm_cvttps_epi32(rtabB);
+            eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+            epsB         = _mm_sub_ps(rtabB , _mm_cvtepi32_ps(n0B) );
+                       nnn          = _mm_slli_epi32(n0,2);
+                       nnnB         = _mm_slli_epi32(n0B,2);
+                       
+            /* the tables are 16-byte aligned, so we can use _mm_load_ps */                    
+            Y            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+            F            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+            G            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+            H            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            YB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,0)); /* YFGH */
+            FB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,1)); /* YFGH */
+            GB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,2)); /* YFGH */
+            HB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,3)); /* YFGH */
+            _MM_TRANSPOSE4_PS(YB,FB,GB,HB);
             
-                       dx           = _mm_sub_pd(ix,jx);
-                       dy           = _mm_sub_pd(iy,jy);
-                       dz           = _mm_sub_pd(iz,jz);
+            G       = _mm_mul_ps(G,eps);
+            H       = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+            F       = _mm_add_ps(F, _mm_add_ps( G , H ) );
+            Y       = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+            F       = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+            vgb     = _mm_mul_ps(Y, qq);           
+            fijGB   = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
+
+            GB      = _mm_mul_ps(GB,epsB);
+            HB      = _mm_mul_ps(HB, _mm_mul_ps(epsB,epsB) );
+            FB      = _mm_add_ps(FB, _mm_add_ps( GB , HB ) );
+            YB      = _mm_add_ps(YB, _mm_mul_ps(FB, epsB));
+            FB      = _mm_add_ps(FB, _mm_add_ps(GB , _mm_mul_ps(HB,two)));
+            vgbB    = _mm_mul_ps(YB, qqB);           
+            fijGBB  = _mm_mul_ps(FB, _mm_mul_ps(qqB,gbscaleB));
+           
+            dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
+            dvdatmpB = _mm_mul_ps(_mm_add_ps(vgbB, _mm_mul_ps(fijGBB,rB)) , minushalf);
+
+            vgbtot  = _mm_add_ps(vgbtot, vgb);
+            vgbtot  = _mm_add_ps(vgbtot, vgbB);
             
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+            dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+            dvdasum = _mm_add_ps(dvdasum, dvdatmpB);
+            dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
+            dvdatmpB = _mm_mul_ps(dvdatmpB, _mm_mul_ps(isajB,isajB));
             
-            rinv         = gmx_mm_invsqrt_pd(rsq);
-                       rinvsq       = _mm_mul_pd(rinv,rinv);
+            GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
+            GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrE,dvda+jnrF,dvda+jnrG,dvda+jnrH,dvdatmpB);
+                       
+            fscal        = _mm_mul_ps( _mm_sub_ps( fscal, fijGB),rinv );
+            fscalB       = _mm_mul_ps( _mm_sub_ps( fscalB, fijGBB),rinvB );
+            
+            /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+
+            /* Calculate temporary vectorial force */
+            tx           = _mm_mul_ps(fscal,dx);
+            ty           = _mm_mul_ps(fscal,dy);
+            tz           = _mm_mul_ps(fscal,dz);
+            txB          = _mm_mul_ps(fscalB,dxB);
+            tyB          = _mm_mul_ps(fscalB,dyB);
+            tzB          = _mm_mul_ps(fscalB,dzB);
+            
+            /* Increment i atom force */
+            fix          = _mm_add_ps(fix,tx);
+            fiy          = _mm_add_ps(fiy,ty);
+            fiz          = _mm_add_ps(fiz,tz);
+            fix          = _mm_add_ps(fix,txB);
+            fiy          = _mm_add_ps(fiy,tyB);
+            fiz          = _mm_add_ps(fiz,tzB);
+            
+            /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
+                       GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3E,faction+j3F,faction+j3G,faction+j3H,txB,tyB,tzB);
+        }
+        for(;k<nj1-3; k+=4)
+               {
+                       jnrA        = jjnr[k];   
+                       jnrB        = jjnr[k+1];
+                       jnrC        = jjnr[k+2];
+                       jnrD        = jjnr[k+3];
+            
+            j3A         = 3*jnrA;  
+                       j3B         = 3*jnrB;
+                       j3C         = 3*jnrC;
+                       j3D         = 3*jnrD;
+            
+            GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
+            
+                       dx           = _mm_sub_ps(ix,jx);
+                       dy           = _mm_sub_ps(iy,jy);
+                       dz           = _mm_sub_ps(iz,jz);
+                       
+                       rsq          = gmx_mm_calc_rsq_ps(dx,dy,dz);
+                       
+                       /* invsqrt */                                                           
+                       rinv         = gmx_mm_invsqrt_ps(rsq);
             
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
-                       GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
-                       GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
-            
-                       isaprod      = _mm_mul_pd(isai,isaj);
-                       qq           = _mm_mul_pd(iq,jq);            
-                       vcoul        = _mm_mul_pd(qq,rinv);
-                       fscal        = _mm_mul_pd(vcoul,rinv);                                 
-            vctot        = _mm_add_pd(vctot,vcoul);
+                       GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
+                       GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
+                       
+                       isaprod      = _mm_mul_ps(isai,isaj);
+                       qq           = _mm_mul_ps(iq,jq);            
+                       vcoul        = _mm_mul_ps(qq,rinv);
+                       fscal        = _mm_mul_ps(vcoul,rinv);                                 
+            vctot        = _mm_add_ps(vctot,vcoul);
             
             /* Polarization interaction */
-                       qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
-                       gbscale      = _mm_mul_pd(isaprod,gbtabscale);
+                       qq           = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+                       gbscale      = _mm_mul_ps(isaprod,gbtabscale);
             
                        /* Calculate GB table index */
-                       r            = _mm_mul_pd(rsq,rinv);
-                       rtab         = _mm_mul_pd(r,gbscale);
-                       
-                       n0                   = _mm_cvttpd_epi32(rtab);
-                       eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
-                       nnn                  = _mm_slli_epi32(n0,2);
-                       
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
-            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
-            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
-            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
-            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
-            vgb     = _mm_mul_pd(Y, qq);           
-            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+                       r            = _mm_mul_ps(rsq,rinv);
+                       rtab         = _mm_mul_ps(r,gbscale);
 
-            vgbtot  = _mm_add_pd(vgbtot, vgb);
+                       n0           = _mm_cvttps_epi32(rtab);
+            eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+                       nnn          = _mm_slli_epi32(n0,2);
+                       
+            /* the tables are 16-byte aligned, so we can use _mm_load_ps */                    
+            Y            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+            F            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+            G            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+            H            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
             
-            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+            G       = _mm_mul_ps(G,eps);
+            H       = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+            F       = _mm_add_ps(F, _mm_add_ps( G , H ) );
+            Y       = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+            F       = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+            vgb     = _mm_mul_ps(Y, qq);           
+            fijGB   = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
+   
+            dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
             
-            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
-                                   
-            fscal        = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
+            vgbtot  = _mm_add_ps(vgbtot, vgb);
+            
+            dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+            dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
+            
+            GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
+            
+            fscal        = _mm_mul_ps( _mm_sub_ps( fscal, fijGB),rinv );
             
             /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
             
             /* Calculate temporary vectorial force */
-            tx           = _mm_mul_pd(fscal,dx);
-            ty           = _mm_mul_pd(fscal,dy);
-            tz           = _mm_mul_pd(fscal,dz);
+            tx           = _mm_mul_ps(fscal,dx);
+            ty           = _mm_mul_ps(fscal,dy);
+            tz           = _mm_mul_ps(fscal,dz);
             
             /* Increment i atom force */
-            fix          = _mm_add_pd(fix,tx);
-            fiy          = _mm_add_pd(fiy,ty);
-            fiz          = _mm_add_pd(fiz,tz);
+            fix          = _mm_add_ps(fix,tx);
+            fiy          = _mm_add_ps(fiy,ty);
+            fiz          = _mm_add_ps(fiz,tz);
             
             /* Store j forces back */
-                       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
-               }
-               
-               /* In double precision, offset can only be either 0 or 1 */
-               if(k<nj1)
-               {
-                       jnrA    = jjnr[k];
-                       j3A     = jnrA * 3;
-            
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-            
-                       dx           = _mm_sub_sd(ix,jx);
-                       dy           = _mm_sub_sd(iy,jy);
-                       dz           = _mm_sub_sd(iz,jz);
-            
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+                       GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
+        }
+        
+        offset = (nj1-nj0)%4;
+        
+               if(offset!=0)
+        {
+            /* Epilogue loop to take care of non-4-multiple j particles */
+            if(offset==1)
+            {
+                jnrA        = jjnr[k];   
+                j3A         = 3*jnrA;  
+                GMX_MM_LOAD_1RVEC_1POINTER_PS(pos+j3A,jx,jy,jz);
+                GMX_MM_LOAD_1VALUE_PS(charge+jnrA,jq);
+                GMX_MM_LOAD_1VALUE_PS(invsqrta+jnrA,isaj);
+                mask        = mask1;
+            } 
+            else if(offset==2)
+            {
+                jnrA        = jjnr[k];   
+                jnrB        = jjnr[k+1];
+                j3A         = 3*jnrA;  
+                j3B         = 3*jnrB;
+                GMX_MM_LOAD_1RVEC_2POINTERS_PS(pos+j3A,pos+j3B,jx,jy,jz);
+                GMX_MM_LOAD_2VALUES_PS(charge+jnrA,charge+jnrB,jq);
+                GMX_MM_LOAD_2VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,isaj);
+                mask        = mask2;
+            }
+            else
+            {
+                /* offset must be 3 */   
+                jnrA        = jjnr[k];   
+                jnrB        = jjnr[k+1];
+                jnrC        = jjnr[k+2];
+                j3A         = 3*jnrA;  
+                j3B         = 3*jnrB;
+                j3C         = 3*jnrC;
+                GMX_MM_LOAD_1RVEC_3POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,jx,jy,jz);
+                GMX_MM_LOAD_3VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,jq);
+                GMX_MM_LOAD_3VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,isaj);
+                mask        = mask3;
+            }
             
-            rinv         = gmx_mm_invsqrt_pd(rsq);
-                       rinvsq       = _mm_mul_sd(rinv,rinv);
+            dx           = _mm_sub_ps(ix,jx);
+                       dy           = _mm_sub_ps(iy,jy);
+                       dz           = _mm_sub_ps(iz,jz);
+                       
+                       rsq          = gmx_mm_calc_rsq_ps(dx,dy,dz);
+                       
+                       /* invsqrt */                                                           
+                       rinv         = gmx_mm_invsqrt_ps(rsq);
+            rinv         = _mm_and_ps(rinv,mask);
             
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
-                       GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
-                       GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-                               
-                       isaprod      = _mm_mul_sd(isai,isaj);
-                       qq           = _mm_mul_sd(iq,jq);            
-                       vcoul        = _mm_mul_sd(qq,rinv);
-                       fscal        = _mm_mul_sd(vcoul,rinv);                                 
-            vctot        = _mm_add_sd(vctot,vcoul);
+            isaprod      = _mm_mul_ps(isai,isaj);
+                       qq           = _mm_and_ps(_mm_mul_ps(iq,jq),mask);        
+                       vcoul        = _mm_mul_ps(qq,rinv);
+                       fscal        = _mm_mul_ps(vcoul,rinv);                                 
+            vctot        = _mm_add_ps(vctot,vcoul);
             
             /* Polarization interaction */
-                       qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
-                       gbscale      = _mm_mul_sd(isaprod,gbtabscale);
+                       qq           = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+                       gbscale      = _mm_mul_ps(isaprod,gbtabscale);
             
                        /* Calculate GB table index */
-                       r            = _mm_mul_sd(rsq,rinv);
-                       rtab         = _mm_mul_sd(r,gbscale);
+                       r            = _mm_mul_ps(rsq,rinv);
+                       rtab         = _mm_mul_ps(r,gbscale);
 
-                       n0                   = _mm_cvttpd_epi32(rtab);
-                       eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
-                       nnn                  = _mm_slli_epi32(n0,2);
+                       n0           = _mm_cvttps_epi32(rtab);
+            eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+                       nnn          = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
-            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
-            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
-            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
-            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
-            vgb     = _mm_mul_sd(Y, qq);           
-            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+            /* the tables are 16-byte aligned, so we can use _mm_load_ps */                    
+            Y            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+            F            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+            G            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+            H            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            
+            G       = _mm_mul_ps(G,eps);
+            H       = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+            F       = _mm_add_ps(F, _mm_add_ps( G , H ) );
+            Y       = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+            F       = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+            vgb     = _mm_mul_ps(Y, qq);           
+            fijGB   = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
 
-            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+            dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);            
+            vgbtot  = _mm_add_ps(vgbtot, vgb);
             
-            vgbtot  = _mm_add_sd(vgbtot, vgb);
+            dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+            dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
             
-            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
-                       
-            fscal        = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
+            fscal        = _mm_mul_ps( _mm_sub_ps( fscal, fijGB),rinv );
             
             /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
             
             /* Calculate temporary vectorial force */
-            tx           = _mm_mul_sd(fscal,dx);
-            ty           = _mm_mul_sd(fscal,dy);
-            tz           = _mm_mul_sd(fscal,dz);
+            tx           = _mm_mul_ps(fscal,dx);
+            ty           = _mm_mul_ps(fscal,dy);
+            tz           = _mm_mul_ps(fscal,dz);
             
             /* Increment i atom force */
-            fix          = _mm_add_sd(fix,tx);
-            fiy          = _mm_add_sd(fiy,ty);
-            fiz          = _mm_add_sd(fiz,tz);
+            fix          = _mm_add_ps(fix,tx);
+            fiy          = _mm_add_ps(fiy,ty);
+            fiz          = _mm_add_ps(fiz,tz);
             
-            /* Store j forces back */
-                       GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
-               }
+            if(offset==1)
+            {
+                GMX_MM_INCREMENT_1VALUE_PS(dvda+jnrA,dvdatmp);
+                GMX_MM_DECREMENT_1RVEC_1POINTER_PS(faction+j3A,tx,ty,tz);
+            } 
+            else if(offset==2)
+            {
+                GMX_MM_INCREMENT_2VALUES_PS(dvda+jnrA,dvda+jnrB,dvdatmp);
+                GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(faction+j3A,faction+j3B,tx,ty,tz);
+            }
+            else
+            {
+                /* offset must be 3 */
+                GMX_MM_INCREMENT_3VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvdatmp);
+                GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,tx,ty,tz);
+            }
+        }
+        
+        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai,isai));
+               gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
                
-        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
-        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+        ggid             = gid[n];         
+
+       GMX_MM_UPDATE_1POT_PS(vctot,vc+ggid);
+       GMX_MM_UPDATE_1POT_PS(vgbtot,gpol+ggid);
+       GMX_MM_UPDATE_1POT_PS(dvdasum,dvda+ii);
+    }
+       
+       *outeriter       = nri;            
+    *inneriter       = nj1;            
+}
+
+
+
+/*
+ * Gromacs nonbonded kernel nb_kernel400nf
+ * Coulomb interaction:     Generalized-Born
+ * VdW interaction:         Not calculated
+ * water optimization:      No
+ * Calculate forces:        no
+ */
+void nb_kernel400nf_x86_64_sse(
+                               int *           p_nri,
+                               int *           iinr,
+                               int *           jindex,
+                               int *           jjnr,
+                               int *           shift,
+                               float *         shiftvec,
+                               float *         fshift,
+                               int *           gid,
+                               float *         pos,
+                               float *         faction,
+                               float *         charge,
+                               float *         p_facel,
+                               float *         p_krf,
+                               float *         p_crf,
+                               float *         Vc,
+                               int *           type,
+                               int *           p_ntype,
+                               float *         vdwparam,
+                               float *         Vvdw,
+                               float *         p_tabscale,
+                               float *         VFtab,
+                               float *         invsqrta,
+                               float *         dvda,
+                               float *         p_gbtabscale,
+                               float *         GBtab,
+                               int *           p_nthreads,
+                               int *           count,
+                               void *          mtx,
+                               int *           outeriter,
+                               int *           inneriter,
+                               float *         work)
+{
+    int           nri,ntype,nthreads;
+    float         facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
+    int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
+    float         shX,shY,shZ;
+    float         iq;
+    float         qq,vcoul,vctot;
+    float         r,rt,eps,eps2;
+    int           n0,nnn;
+    float         Y,F,Geps,Heps2,Fp,VV;
+    float         isai,isaj,isaprod,gbscale;
+    float         ix1,iy1,iz1;
+    float         jx1,jy1,jz1;
+    float         dx11,dy11,dz11,rsq11,rinv11;
+    const int     fractshift = 12;
+    const int     fractmask = 8388607;
+    const int     expshift = 23;
+    const int     expmask = 2139095040;
+    const int     explsb = 8388608;
+    float         lu;
+    int           iexp,addr;
+    union { unsigned int bval; float fval; } bitpattern,result;
+    
+    nri              = *p_nri;         
+    ntype            = *p_ntype;       
+    nthreads         = *p_nthreads;    
+    facel            = *p_facel;       
+    krf              = *p_krf;         
+    crf              = *p_crf;         
+    tabscale         = *p_tabscale;    
+    gbtabscale       = *p_gbtabscale;  
+    nj1              = 0;              
+    
+    for(n=0; (n<nri); n++)
+    {
+        is3              = 3*shift[n];     
+        shX              = shiftvec[is3];  
+        shY              = shiftvec[is3+1];
+        shZ              = shiftvec[is3+2];
+        nj0              = jindex[n];      
+        nj1              = jindex[n+1];    
+        ii               = iinr[n];        
+        ii3              = 3*ii;           
+        ix1              = shX + pos[ii3+0];
+        iy1              = shY + pos[ii3+1];
+        iz1              = shZ + pos[ii3+2];
+        iq               = facel*charge[ii];
+        isai             = invsqrta[ii];   
+        vctot            = 0;              
         
-        ggid     = gid[n];         
+        for(k=nj0; (k<nj1); k++)
+        {
+            jnr              = jjnr[k];        
+            j3               = 3*jnr;          
+            jx1              = pos[j3+0];      
+            jy1              = pos[j3+1];      
+            jz1              = pos[j3+2];      
+            dx11             = ix1 - jx1;      
+            dy11             = iy1 - jy1;      
+            dz11             = iz1 - jz1;      
+            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
+            bitpattern.fval  = rsq11;          
+            iexp             = (((bitpattern.bval)&expmask)>>expshift);
+            addr             = (((bitpattern.bval)&(fractmask|explsb))>>fractshift);
+            result.bval      = gmx_invsqrt_exptab[iexp] | gmx_invsqrt_fracttab[addr];
+            lu               = result.fval;    
+            rinv11           = (0.5*lu*(3.0-((rsq11*lu)*lu)));
+            isaj             = invsqrta[jnr];  
+            isaprod          = isai*isaj;      
+            qq               = iq*charge[jnr]; 
+            vcoul            = qq*rinv11;      
+            qq               = isaprod*(-qq);  
+            gbscale          = isaprod*gbtabscale;
+            r                = rsq11*rinv11;   
+            rt               = r*gbscale;      
+            n0               = rt;             
+            eps              = rt-n0;          
+            eps2             = eps*eps;        
+            nnn              = 4*n0;           
+            Y                = GBtab[nnn];     
+            F                = GBtab[nnn+1];   
+            Geps             = eps*GBtab[nnn+2];
+            Heps2            = eps2*GBtab[nnn+3];
+            Fp               = F+Geps+Heps2;   
+            VV               = Y+eps*Fp;       
+            vgb              = qq*VV;          
+            vctot            = vctot + vcoul;  
+        }
         
-        gmx_mm_update_1pot_pd(vctot,vc+ggid);
-        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
-       }
+        ggid             = gid[n];         
+        Vc[ggid]         = Vc[ggid] + vctot;
+    }
     
-       *outeriter   = nri;            
-    *inneriter   = nj1;        
+    *outeriter       = nri;            
+    *inneriter       = nj1;            
 }
+
+
index 759916a8e620d9905766fe3913b5d11baf484a6d..05f376f2d0d4a4f63eca96ab5059637a1fc32668 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Options used when generation this file:
  * Language:         c
- * Precision:        double
+ * Precision:        single
  * Threads:          no
  * Software invsqrt: yes
  * Prefetch forces:  no
 #include <xmmintrin.h>
 #include <emmintrin.h>
 
-#include <gmx_sse2_double.h>
+#include <gmx_sse2_single.h>
 
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
 
-#include "nb_kernel410_ia32_sse2.h"
+#include "nb_kernel410_ia32_sse.h"
 
 
 
-void nb_kernel410_ia32_sse2(int *           p_nri,
-                                                       int *           iinr,
-                                                       int *           jindex,
-                                                       int *           jjnr,
-                                                       int *           shift,
-                                                       double *         shiftvec,
-                                                       double *         fshift,
-                                                       int *           gid,
-                                                       double *         pos,
-                                                       double *         faction,
-                                                       double *         charge,
-                                                       double *         p_facel,
-                                                       double *         p_krf,
-                                                       double *         p_crf,
-                                                       double *         vc,
-                                                       int *           type,
-                                                       int *           p_ntype,
-                                                       double *         vdwparam,
-                                                       double *         vvdw,
-                                                       double *         p_tabscale,
-                                                       double *         VFtab,
-                                                       double *         invsqrta,
-                                                       double *         dvda,
-                                                       double *         p_gbtabscale,
-                                                       double *         GBtab,
-                                                       int *           p_nthreads,
-                                                       int *           count,
-                                                       void *          mtx,
-                                                       int *           outeriter,
-                                                       int *           inneriter,
-                                                       double *         work)
+void nb_kernel410_ia32_sse(int *           p_nri,
+                    int *           iinr,
+                    int *           jindex,
+                    int *           jjnr,
+                    int *           shift,
+                    float *         shiftvec,
+                    float *         fshift,
+                    int *           gid,
+                    float *         pos,
+                    float *         faction,
+                    float *         charge,
+                    float *         p_facel,
+                    float *         p_krf,
+                    float *         p_crf,
+                    float *         vc,
+                    int *           type,
+                    int *           p_ntype,
+                    float *         vdwparam,
+                    float *         vvdw,
+                    float *         p_tabscale,
+                    float *         VFtab,
+                    float *         invsqrta,
+                    float *         dvda,
+                    float *         p_gbtabscale,
+                    float *         GBtab,
+                    int *           p_nthreads,
+                    int *           count,
+                    void *          mtx,
+                    int *           outeriter,
+                    int *           inneriter,
+                    float *         work)
 {
     int           nri,ntype,nthreads;
     int           n,ii,is3,ii3,k,nj0,nj1,ggid;
-    double        shX,shY,shZ;
+    float         shX,shY,shZ;
        int                       offset,nti;
-    int           jnrA,jnrB;
-    int           j3A,j3B;
-       int           tjA,tjB;
+    int           jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
+    int           jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
+       int           tjA,tjB,tjC,tjD;
+       int           tjE,tjF,tjG,tjH;
        gmx_gbdata_t *gbdata;
-       double *      gpol;
+       float *        gpol;
+       
+       __m128   iq,qq,qqB,jq,jqB,isai;
+       __m128   ix,iy,iz;
+       __m128   jx,jy,jz;
+       __m128   jxB,jyB,jzB;
+       __m128   dx,dy,dz;
+       __m128   dxB,dyB,dzB;
+       __m128   vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
+       __m128   fix,fiy,fiz,tx,ty,tz,rsq;
+       __m128   fixB,fiyB,fizB,txB,tyB,tzB,rsqB;
+       __m128   rinv,isaj,isaprod;
+       __m128   rinvB,isajB,isaprodB;
+       __m128   vcoul,vcoulB,fscal,fscalB,gbscale,gbscaleB,c6,c6B,c12,c12B;
+       __m128   rinvsq,r,rtab;
+       __m128   rinvsqB,rB,rtabB;
+       __m128   eps,Y,F,G,H;
+       __m128   epsB,YB,FB,GB,HB;
+       __m128   vgb,vgbB,fijGB,fijGBB,dvdatmp,dvdatmpB;
+       __m128   rinvsix,vvdw6,vvdw12;
+       __m128   rinvsixB,vvdw6B,vvdw12B;
+       __m128   facel,gbtabscale,mask,dvdaj;
+    
+    __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
+       __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
+       __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
     
-       __m128d  iq,qq,jq,isai;
-       __m128d  ix,iy,iz;
-       __m128d  jx,jy,jz;
-       __m128d  dx,dy,dz;
-       __m128d  vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
-       __m128d  fix,fiy,fiz,tx,ty,tz,rsq;
-       __m128d  rinv,isaj,isaprod;
-       __m128d  vcoul,fscal,gbscale,c6,c12;
-       __m128d  rinvsq,r,rtab;
-       __m128d  eps,Y,F,G,H;
-       __m128d  vgb,fijGB,dvdatmp;
-       __m128d  rinvsix,vvdw6,vvdw12;
-       __m128d  facel,gbtabscale,dvdaj;
        __m128i  n0, nnn;
+       __m128i  n0B, nnnB;
        
-       const __m128d neg        = _mm_set1_pd(-1.0);
-       const __m128d zero       = _mm_set1_pd(0.0);
-       const __m128d minushalf  = _mm_set1_pd(-0.5);
-       const __m128d two        = _mm_set1_pd(2.0);
-       const __m128d six        = _mm_set1_pd(6.0);
-       const __m128d twelve     = _mm_set1_pd(12.0);
-       
-       gbdata     = (gmx_gbdata_t *)work;
-       gpol       = gbdata->gpol;
+       const __m128 neg        = _mm_set1_ps(-1.0f);
+       const __m128 zero       = _mm_set1_ps(0.0f);
+       const __m128 minushalf  = _mm_set1_ps(-0.5f);
+       const __m128 two        = _mm_set1_ps(2.0f);
+       const __m128 six        = _mm_set1_ps(6.0f);
+       const __m128 twelve     = _mm_set1_ps(12.0f);
 
-       nri        = *p_nri;
-       ntype      = *p_ntype;
+       gbdata          = (gmx_gbdata_t *)work;
+       gpol            = gbdata->gpol;
+       
+       nri              = *p_nri;         
+    ntype            = *p_ntype;       
     
-    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
-    gbtabscale = _mm_load1_pd(p_gbtabscale);  
-    facel      = _mm_load1_pd(p_facel);
+    gbfactor         = _mm_set1_ps( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+    gbtabscale       = _mm_load1_ps(p_gbtabscale);  
+    facel            = _mm_load1_ps(p_facel);
 
-    nj1         = 0;
-    jnrA = jnrB = 0;
-    j3A = j3B   = 0;
-    jx          = _mm_setzero_pd();
-    jy          = _mm_setzero_pd();
-    jz          = _mm_setzero_pd();
-    c6          = _mm_setzero_pd();
-    c12         = _mm_setzero_pd();
-       
-       for(n=0;n<nri;n++)
-       {
+    nj1              = 0;
+    jnrA = jnrB = jnrC = jnrD = 0;
+    j3A = j3B = j3C = j3D = 0;
+    jx               = _mm_setzero_ps();
+    jy               = _mm_setzero_ps();
+    jz               = _mm_setzero_ps();
+    c6               = _mm_setzero_ps();
+    c12              = _mm_setzero_ps();
+
+    
+    for(n=0; (n<nri); n++)
+    {
         is3              = 3*shift[n];     
         shX              = shiftvec[is3];  
         shY              = shiftvec[is3+1];
@@ -123,241 +138,633 @@ void nb_kernel410_ia32_sse2(int *           p_nri,
         nj1              = jindex[n+1];    
         ii               = iinr[n];        
         ii3              = 3*ii;           
-               
-               ix               = _mm_set1_pd(shX+pos[ii3+0]);
-               iy               = _mm_set1_pd(shY+pos[ii3+1]);
-               iz               = _mm_set1_pd(shZ+pos[ii3+2]);
-        
-               iq               = _mm_load1_pd(charge+ii);
-               iq               = _mm_mul_pd(iq,facel);
-        
-               isai             = _mm_load1_pd(invsqrta+ii);
         
+               ix               = _mm_set1_ps(shX+pos[ii3+0]);
+               iy               = _mm_set1_ps(shY+pos[ii3+1]);
+               iz               = _mm_set1_ps(shZ+pos[ii3+2]);
+       
+               iq               = _mm_load1_ps(charge+ii);
+               iq               = _mm_mul_ps(iq,facel);
+                       
+               isai             = _mm_load1_ps(invsqrta+ii);
+                                                               
                nti              = 2*ntype*type[ii];
                
-               vctot            = _mm_setzero_pd();
-               vvdwtot          = _mm_setzero_pd();
-               vgbtot           = _mm_setzero_pd();
-               dvdasum          = _mm_setzero_pd();
-               fix              = _mm_setzero_pd();
-               fiy              = _mm_setzero_pd();
-               fiz              = _mm_setzero_pd();
-        
-               for(k=nj0;k<nj1-1; k+=2)
+               vctot            = _mm_setzero_ps();
+               vvdwtot          = _mm_setzero_ps();
+               vgbtot           = _mm_setzero_ps();
+               dvdasum          = _mm_setzero_ps();
+               fix              = _mm_setzero_ps();
+               fiy              = _mm_setzero_ps();
+               fiz              = _mm_setzero_ps();
+       
+        for(k=nj0; k<nj1-7; k+=8)
                {
-                       jnrA    = jjnr[k];
-                       jnrB    = jjnr[k+1];
-                       
-                       j3A     = jnrA * 3;
-                       j3B     = jnrB * 3;
-
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+                       jnrA        = jjnr[k];   
+                       jnrB        = jjnr[k+1];
+                       jnrC        = jjnr[k+2];
+                       jnrD        = jjnr[k+3];
+                       jnrE        = jjnr[k+4];   
+                       jnrF        = jjnr[k+5];
+                       jnrG        = jjnr[k+6];
+                       jnrH        = jjnr[k+7];
+            
+            j3A         = 3*jnrA;  
+                       j3B         = 3*jnrB;
+                       j3C         = 3*jnrC;
+                       j3D         = 3*jnrD;
+            j3E         = 3*jnrE;  
+                       j3F         = 3*jnrF;
+                       j3G         = 3*jnrG;
+                       j3H         = 3*jnrH;
+            
 
-                       dx           = _mm_sub_pd(ix,jx);
-                       dy           = _mm_sub_pd(iy,jy);
-                       dz           = _mm_sub_pd(iz,jz);
+            GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
+            GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3E,pos+j3F,pos+j3G,pos+j3H,jxB,jyB,jzB);
 
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+                       dx           = _mm_sub_ps(ix,jx);
+                       dy           = _mm_sub_ps(iy,jy);
+                       dz           = _mm_sub_ps(iz,jz);
+                       dxB          = _mm_sub_ps(ix,jxB);
+                       dyB          = _mm_sub_ps(iy,jyB);
+                       dzB          = _mm_sub_ps(iz,jzB);
+                       
+                       rsq          = gmx_mm_calc_rsq_ps(dx,dy,dz);
+                       rsqB         = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
+                       
+                       /* invsqrt */                                                           
+                       rinv         = gmx_mm_invsqrt_ps(rsq);
+                       rinvB        = gmx_mm_invsqrt_ps(rsqB);
+                       rinvsq       = _mm_mul_ps(rinv,rinv);
+                       rinvsqB      = _mm_mul_ps(rinvB,rinvB);
 
-            rinv         = gmx_mm_invsqrt_pd(rsq);
-                       rinvsq       = _mm_mul_pd(rinv,rinv);
-            
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
-                       GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
-                       GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
-            
+                       GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
+                       GMX_MM_LOAD_4VALUES_PS(charge+jnrE,charge+jnrF,charge+jnrG,charge+jnrH,jqB);
+                       GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
+                       GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrE,invsqrta+jnrF,invsqrta+jnrG,invsqrta+jnrH,isajB);
+
             /* Lennard-Jones */
             tjA          = nti+2*type[jnrA];
                        tjB          = nti+2*type[jnrB];
-            
-            GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+                       tjC          = nti+2*type[jnrC];
+                       tjD          = nti+2*type[jnrD];
+            tjE          = nti+2*type[jnrE];
+                       tjF          = nti+2*type[jnrF];
+                       tjG          = nti+2*type[jnrG];
+                       tjH          = nti+2*type[jnrH];
+            
+            GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,vdwparam+tjD,c6,c12);
+            GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjE,vdwparam+tjF,vdwparam+tjG,vdwparam+tjH,c6B,c12B);
                        
-                       isaprod      = _mm_mul_pd(isai,isaj);
-                       qq           = _mm_mul_pd(iq,jq);            
-                       vcoul        = _mm_mul_pd(qq,rinv);
-                       fscal        = _mm_mul_pd(vcoul,rinv);                                 
-            vctot        = _mm_add_pd(vctot,vcoul);
+                       isaprod      = _mm_mul_ps(isai,isaj);
+                       isaprodB     = _mm_mul_ps(isai,isajB);
+                       qq           = _mm_mul_ps(iq,jq);            
+                       qqB          = _mm_mul_ps(iq,jqB);            
+                       vcoul        = _mm_mul_ps(qq,rinv);
+                       vcoulB       = _mm_mul_ps(qqB,rinvB);
+                       fscal        = _mm_mul_ps(vcoul,rinv);                                 
+                       fscalB       = _mm_mul_ps(vcoulB,rinvB);                                 
+            vctot        = _mm_add_ps(vctot,vcoul);
+            vctot        = _mm_add_ps(vctot,vcoulB);
             
             /* Polarization interaction */
-                       qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
-                       gbscale      = _mm_mul_pd(isaprod,gbtabscale);
-            
+                       qq           = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+                       qqB          = _mm_mul_ps(qqB,_mm_mul_ps(isaprodB,gbfactor));
+                       gbscale      = _mm_mul_ps(isaprod,gbtabscale);
+                       gbscaleB     = _mm_mul_ps(isaprodB,gbtabscale);
+
                        /* Calculate GB table index */
-                       r            = _mm_mul_pd(rsq,rinv);
-                       rtab         = _mm_mul_pd(r,gbscale);
+                       r            = _mm_mul_ps(rsq,rinv);
+                       rB           = _mm_mul_ps(rsqB,rinvB);
+                       rtab         = _mm_mul_ps(r,gbscale);
+                       rtabB        = _mm_mul_ps(rB,gbscaleB);
                        
-                       n0                   = _mm_cvttpd_epi32(rtab);
-                       eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
-                       nnn                  = _mm_slli_epi32(n0,2);
+                       n0           = _mm_cvttps_epi32(rtab);
+                       n0B          = _mm_cvttps_epi32(rtabB);
+            eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+            epsB         = _mm_sub_ps(rtabB , _mm_cvtepi32_ps(n0B) );
+                       nnn          = _mm_slli_epi32(n0,2);
+                       nnnB         = _mm_slli_epi32(n0B,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
-            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
-            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
-            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
-            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
-            vgb     = _mm_mul_pd(Y, qq);           
-            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_pd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+            /* the tables are 16-byte aligned, so we can use _mm_load_ps */                    
+            Y            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+            F            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+            G            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+            H            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            YB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,0)); /* YFGH */
+            FB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,1)); /* YFGH */
+            GB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,2)); /* YFGH */
+            HB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,3)); /* YFGH */
+            _MM_TRANSPOSE4_PS(YB,FB,GB,HB);
+            
+            G       = _mm_mul_ps(G,eps);
+            H       = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+            F       = _mm_add_ps(F, _mm_add_ps( G , H ) );
+            Y       = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+            F       = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+            vgb     = _mm_mul_ps(Y, qq);           
+            fijGB   = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
+
+            GB      = _mm_mul_ps(GB,epsB);
+            HB      = _mm_mul_ps(HB, _mm_mul_ps(epsB,epsB) );
+            FB      = _mm_add_ps(FB, _mm_add_ps( GB , HB ) );
+            YB      = _mm_add_ps(YB, _mm_mul_ps(FB, epsB));
+            FB      = _mm_add_ps(FB, _mm_add_ps(GB , _mm_mul_ps(HB,two)));
+            vgbB    = _mm_mul_ps(YB, qqB);           
+            fijGBB  = _mm_mul_ps(FB, _mm_mul_ps(qqB,gbscaleB));
+            
+            
+            dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
+            dvdatmpB = _mm_mul_ps(_mm_add_ps(vgbB, _mm_mul_ps(fijGBB,rB)) , minushalf);
+
+            vgbtot  = _mm_add_ps(vgbtot, vgb);
+            vgbtot  = _mm_add_ps(vgbtot, vgbB);
+            
+            dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+            dvdasum = _mm_add_ps(dvdasum, dvdatmpB);
+            dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
+            dvdatmpB = _mm_mul_ps(dvdatmpB, _mm_mul_ps(isajB,isajB));
+            
+            GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
+            GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrE,dvda+jnrF,dvda+jnrG,dvda+jnrH,dvdatmpB);
                        
-                       rinvsix      = _mm_mul_pd(rinvsq,rinvsq);
-                       rinvsix      = _mm_mul_pd(rinvsix,rinvsq);
+                       rinvsix      = _mm_mul_ps(rinvsq,rinvsq);
+                       rinvsixB     = _mm_mul_ps(rinvsqB,rinvsqB);
+                       rinvsix      = _mm_mul_ps(rinvsix,rinvsq);
+                       rinvsixB     = _mm_mul_ps(rinvsixB,rinvsqB);
                        
-                       vvdw6        = _mm_mul_pd(c6,rinvsix);
-                       vvdw12       = _mm_mul_pd(c12, _mm_mul_pd(rinvsix,rinvsix));
-                       vvdwtot      = _mm_add_pd(vvdwtot,_mm_sub_pd(vvdw12,vvdw6));
-            
-            fscal        = _mm_sub_pd(_mm_mul_pd(rinvsq, 
-                                                 _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
-                                                            _mm_mul_pd(six,vvdw6))),
-                                      _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
-                        
+                       vvdw6        = _mm_mul_ps(c6,rinvsix);
+                       vvdw6B       = _mm_mul_ps(c6B,rinvsixB);
+                       vvdw12       = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
+                       vvdw12B      = _mm_mul_ps(c12B, _mm_mul_ps(rinvsixB,rinvsixB));
+                       vvdwtot      = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
+                       vvdwtot      = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12B,vvdw6B));
+
+            fscal        = _mm_sub_ps(_mm_mul_ps(rinvsq, 
+                                                       _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
+                                                                  _mm_mul_ps(six,vvdw6))),
+                                      _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
+
+            fscalB       = _mm_sub_ps(_mm_mul_ps(rinvsqB, 
+                                                 _mm_sub_ps(_mm_mul_ps(twelve,vvdw12B),
+                                                            _mm_mul_ps(six,vvdw6B))),
+                                      _mm_mul_ps( _mm_sub_ps( fijGBB,fscalB),rinvB ));
+            
             /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
-            
+
             /* Calculate temporary vectorial force */
-            tx           = _mm_mul_pd(fscal,dx);
-            ty           = _mm_mul_pd(fscal,dy);
-            tz           = _mm_mul_pd(fscal,dz);
+            tx           = _mm_mul_ps(fscal,dx);
+            ty           = _mm_mul_ps(fscal,dy);
+            tz           = _mm_mul_ps(fscal,dz);
+            txB          = _mm_mul_ps(fscalB,dxB);
+            tyB          = _mm_mul_ps(fscalB,dyB);
+            tzB          = _mm_mul_ps(fscalB,dzB);
             
             /* Increment i atom force */
-            fix          = _mm_add_pd(fix,tx);
-            fiy          = _mm_add_pd(fiy,ty);
-            fiz          = _mm_add_pd(fiz,tz);
+            fix          = _mm_add_ps(fix,tx);
+            fiy          = _mm_add_ps(fiy,ty);
+            fiz          = _mm_add_ps(fiz,tz);
+            fix          = _mm_add_ps(fix,txB);
+            fiy          = _mm_add_ps(fiy,tyB);
+            fiz          = _mm_add_ps(fiz,tzB);
             
             /* Store j forces back */
-                       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
-               }
-               
-               /* In double precision, offset can only be either 0 or 1 */
-               if(k<nj1)
+                       GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
+                       GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3E,faction+j3F,faction+j3G,faction+j3H,txB,tyB,tzB);
+        }
+        for(;k<nj1-3; k+=4)
                {
-                       jnrA    = jjnr[k];
-                       
-                       j3A     = jnrA * 3;
+                       jnrA        = jjnr[k];   
+                       jnrB        = jjnr[k+1];
+                       jnrC        = jjnr[k+2];
+                       jnrD        = jjnr[k+3];
             
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+            j3A         = 3*jnrA;  
+                       j3B         = 3*jnrB;
+                       j3C         = 3*jnrC;
+                       j3D         = 3*jnrD;
             
-                       dx           = _mm_sub_sd(ix,jx);
-                       dy           = _mm_sub_sd(iy,jy);
-                       dz           = _mm_sub_sd(iz,jz);
             
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+            GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
             
-            rinv         = gmx_mm_invsqrt_pd(rsq);
-                       rinvsq       = _mm_mul_sd(rinv,rinv);
+                       dx           = _mm_sub_ps(ix,jx);
+                       dy           = _mm_sub_ps(iy,jy);
+                       dz           = _mm_sub_ps(iz,jz);
+                       
+                       rsq          = gmx_mm_calc_rsq_ps(dx,dy,dz);
+                       
+                       /* invsqrt */                                                           
+                       rinv         = gmx_mm_invsqrt_ps(rsq);
+                       rinvsq       = _mm_mul_ps(rinv,rinv);
             
                        /***********************************/
                        /* INTERACTION SECTION STARTS HERE */
                        /***********************************/
-                       GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
-                       GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+                       GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
+                       GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
             
             /* Lennard-Jones */
             tjA          = nti+2*type[jnrA];
+                       tjB          = nti+2*type[jnrB];
+                       tjC          = nti+2*type[jnrC];
+                       tjD          = nti+2*type[jnrD];
             
-            GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+            GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,vdwparam+tjD,c6,c12);
                        
-                       isaprod      = _mm_mul_sd(isai,isaj);
-                       qq           = _mm_mul_sd(iq,jq);            
-                       vcoul        = _mm_mul_sd(qq,rinv);
-                       fscal        = _mm_mul_sd(vcoul,rinv);                                 
-            vctot        = _mm_add_sd(vctot,vcoul);
+                       isaprod      = _mm_mul_ps(isai,isaj);
+                       qq           = _mm_mul_ps(iq,jq);            
+                       vcoul        = _mm_mul_ps(qq,rinv);
+                       fscal        = _mm_mul_ps(vcoul,rinv);                                 
+            vctot        = _mm_add_ps(vctot,vcoul);
             
             /* Polarization interaction */
-                       qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
-                       gbscale      = _mm_mul_sd(isaprod,gbtabscale);
+                       qq           = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+                       gbscale      = _mm_mul_ps(isaprod,gbtabscale);
             
                        /* Calculate GB table index */
-                       r            = _mm_mul_sd(rsq,rinv);
-                       rtab         = _mm_mul_sd(r,gbscale);
+                       r            = _mm_mul_ps(rsq,rinv);
+                       rtab         = _mm_mul_ps(r,gbscale);
                        
-                       n0                   = _mm_cvttpd_epi32(rtab);
-                       eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
-                       nnn                  = _mm_slli_epi32(n0,2);
+                       n0           = _mm_cvttps_epi32(rtab);
+            eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+                       nnn          = _mm_slli_epi32(n0,2);
                        
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
-            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
-            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
-            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
-            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
-            vgb     = _mm_mul_sd(Y, qq);           
-            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_sd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+            /* the tables are 16-byte aligned, so we can use _mm_load_ps */                    
+            Y            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+            F            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+            G            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+            H            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            
+            G       = _mm_mul_ps(G,eps);
+            H       = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+            F       = _mm_add_ps(F, _mm_add_ps( G , H ) );
+            Y       = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+            F       = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+            vgb     = _mm_mul_ps(Y, qq);           
+            fijGB   = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
+            
+            
+            dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
+            
+            vgbtot  = _mm_add_ps(vgbtot, vgb);
+            
+            dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+            dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
+            
+            GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
+            
                        
-                       rinvsix      = _mm_mul_sd(rinvsq,rinvsq);
-                       rinvsix      = _mm_mul_sd(rinvsix,rinvsq);
+                       rinvsix      = _mm_mul_ps(rinvsq,rinvsq);
+                       rinvsix      = _mm_mul_ps(rinvsix,rinvsq);
                        
-                       vvdw6        = _mm_mul_sd(c6,rinvsix);
-                       vvdw12       = _mm_mul_sd(c12, _mm_mul_sd(rinvsix,rinvsix));
-                       vvdwtot      = _mm_add_sd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6));
-        
-            fscal        = _mm_sub_sd(_mm_mul_sd(rinvsq, 
-                                                 _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
-                                                            _mm_mul_sd(six,vvdw6))),
-                                      _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
+                       vvdw6        = _mm_mul_ps(c6,rinvsix);
+                       vvdw12       = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
+                       vvdwtot      = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
+            
+            fscal        = _mm_sub_ps(_mm_mul_ps(rinvsq, 
+                                                 _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
+                                                            _mm_mul_ps(six,vvdw6))),
+                                      _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
             
             /***********************************/
                        /*  INTERACTION SECTION ENDS HERE  */
                        /***********************************/
             
             /* Calculate temporary vectorial force */
-            tx           = _mm_mul_sd(fscal,dx);
-            ty           = _mm_mul_sd(fscal,dy);
-            tz           = _mm_mul_sd(fscal,dz);
+            tx           = _mm_mul_ps(fscal,dx);
+            ty           = _mm_mul_ps(fscal,dy);
+            tz           = _mm_mul_ps(fscal,dz);
             
             /* Increment i atom force */
-            fix          = _mm_add_sd(fix,tx);
-            fiy          = _mm_add_sd(fiy,ty);
-            fiz          = _mm_add_sd(fiz,tz);
+            fix          = _mm_add_ps(fix,tx);
+            fiy          = _mm_add_ps(fiy,ty);
+            fiz          = _mm_add_ps(fiz,tz);
             
             /* Store j forces back */
-                       GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
-               }
+                       GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
+        }
+        
+        offset = (nj1-nj0)%4;
+        
+               if(offset!=0)
+        {
+            /* Epilogue loop to take care of non-4-multiple j particles */
+            if(offset==1)
+            {
+                jnrA        = jjnr[k];   
+                j3A         = 3*jnrA;  
+                tjA          = nti+2*type[jnrA];
+                GMX_MM_LOAD_1RVEC_1POINTER_PS(pos+j3A,jx,jy,jz);
+                GMX_MM_LOAD_1VALUE_PS(charge+jnrA,jq);
+                GMX_MM_LOAD_1VALUE_PS(invsqrta+jnrA,isaj);
+                GMX_MM_LOAD_1PAIR_PS(vdwparam+tjA,c6,c12);
+                mask        = mask1;
+            } 
+            else if(offset==2)
+            {
+                jnrA        = jjnr[k];   
+                jnrB        = jjnr[k+1];
+                j3A         = 3*jnrA;  
+                j3B         = 3*jnrB;
+                tjA          = nti+2*type[jnrA];
+                tjB          = nti+2*type[jnrB];
+                GMX_MM_LOAD_1RVEC_2POINTERS_PS(pos+j3A,pos+j3B,jx,jy,jz);
+                GMX_MM_LOAD_2VALUES_PS(charge+jnrA,charge+jnrB,jq);
+                GMX_MM_LOAD_2VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,isaj);
+                GMX_MM_LOAD_2PAIRS_PS(vdwparam+tjA,vdwparam+tjB,c6,c12);
+                mask        = mask2;
+            }
+            else
+            {
+                /* offset must be 3 */   
+                jnrA        = jjnr[k];   
+                jnrB        = jjnr[k+1];
+                jnrC        = jjnr[k+2];
+                j3A         = 3*jnrA;  
+                j3B         = 3*jnrB;
+                j3C         = 3*jnrC;
+                tjA          = nti+2*type[jnrA];
+                tjB          = nti+2*type[jnrB];
+                tjC          = nti+2*type[jnrC];
+                GMX_MM_LOAD_1RVEC_3POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,jx,jy,jz);
+                GMX_MM_LOAD_3VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,jq);
+                GMX_MM_LOAD_3VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,isaj);
+                GMX_MM_LOAD_3PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,c6,c12);
+                mask        = mask3;
+            }
+            
+            dx           = _mm_sub_ps(ix,jx);
+                       dy           = _mm_sub_ps(iy,jy);
+                       dz           = _mm_sub_ps(iz,jz);
+                       
+                       rsq          = gmx_mm_calc_rsq_ps(dx,dy,dz);
+                       
+                       /* invsqrt */                                                           
+                       rinv         = gmx_mm_invsqrt_ps(rsq);
+            rinv         = _mm_and_ps(rinv,mask);
+            
+                       rinvsq       = _mm_mul_ps(rinv,rinv);
+            
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+            isaprod      = _mm_mul_ps(isai,isaj);
+                       qq           = _mm_and_ps(_mm_mul_ps(iq,jq),mask);        
+                       vcoul        = _mm_mul_ps(qq,rinv);
+                       fscal        = _mm_mul_ps(vcoul,rinv);                                 
+            vctot        = _mm_add_ps(vctot,vcoul);
+            
+            /* Polarization interaction */
+                       qq           = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+                       gbscale      = _mm_mul_ps(isaprod,gbtabscale);
+            
+                       /* Calculate GB table index */
+                       r            = _mm_mul_ps(rsq,rinv);
+                       rtab         = _mm_mul_ps(r,gbscale);
+                       
+                       n0           = _mm_cvttps_epi32(rtab);
+            eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+                       nnn          = _mm_slli_epi32(n0,2);
+                       
+            /* the tables are 16-byte aligned, so we can use _mm_load_ps */                    
+            Y            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+            F            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+            G            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+            H            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+            _MM_TRANSPOSE4_PS(Y,F,G,H);
+            
+            G       = _mm_mul_ps(G,eps);
+            H       = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+            F       = _mm_add_ps(F, _mm_add_ps( G , H ) );
+            Y       = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+            F       = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+            vgb     = _mm_mul_ps(Y, qq);           
+            fijGB   = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
+            
+            dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);            
+            vgbtot  = _mm_add_ps(vgbtot, vgb);
+            
+            dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+            dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
+            
+            /* Lennard-Jones */
+            
+                       rinvsix      = _mm_mul_ps(rinvsq,rinvsq);
+                       rinvsix      = _mm_mul_ps(rinvsix,rinvsq);
+                       
+                       vvdw6        = _mm_mul_ps(c6,rinvsix);
+                       vvdw12       = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
+                       vvdwtot      = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
+            
+            fscal        = _mm_sub_ps(_mm_mul_ps(rinvsq, 
+                                                 _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
+                                                            _mm_mul_ps(six,vvdw6))),
+                                      _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
+            
+            /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+            
+            /* Calculate temporary vectorial force */
+            tx           = _mm_mul_ps(fscal,dx);
+            ty           = _mm_mul_ps(fscal,dy);
+            tz           = _mm_mul_ps(fscal,dz);
+            
+            /* Increment i atom force */
+            fix          = _mm_add_ps(fix,tx);
+            fiy          = _mm_add_ps(fiy,ty);
+            fiz          = _mm_add_ps(fiz,tz);
+            
+            if(offset==1)
+            {
+                GMX_MM_INCREMENT_1VALUE_PS(dvda+jnrA,dvdatmp);
+                GMX_MM_DECREMENT_1RVEC_1POINTER_PS(faction+j3A,tx,ty,tz);
+            } 
+            else if(offset==2)
+            {
+                GMX_MM_INCREMENT_2VALUES_PS(dvda+jnrA,dvda+jnrB,dvdatmp);
+                GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(faction+j3A,faction+j3B,tx,ty,tz);
+            }
+            else
+            {
+                /* offset must be 3 */
+                GMX_MM_INCREMENT_3VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvdatmp);
+                GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,tx,ty,tz);
+            }
+        }
+        
+        dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai,isai));
+               gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
                
-        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
-        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+        ggid             = gid[n];         
+       GMX_MM_UPDATE_4POT_PS(vctot,vc+ggid,vvdwtot,vvdw+ggid,vgbtot,gpol+ggid,dvdasum,dvda+ii);
+    }
+       
+       *outeriter       = nri;            
+    *inneriter       = nj1;            
+}
+
 
-        ggid     = gid[n];         
-        
-        gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
-        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
-       }
 
-       *outeriter   = nri;            
-    *inneriter   = nj1;        
+/*
+ * Gromacs nonbonded kernel nb_kernel410nf
+ * Coulomb interaction:     Generalized-Born
+ * VdW interaction:         Lennard-Jones
+ * water optimization:      No
+ * Calculate forces:        no
+ */
+void nb_kernel410nf_ia32_sse(
+                    int *           p_nri,
+                    int *           iinr,
+                    int *           jindex,
+                    int *           jjnr,
+                    int *           shift,
+                    float *         shiftvec,
+                    float *         fshift,
+                    int *           gid,
+                    float *         pos,
+                    float *         faction,
+                    float *         charge,
+                    float *         p_facel,
+                    float *         p_krf,
+                    float *         p_crf,
+                    float *         Vc,
+                    int *           type,
+                    int *           p_ntype,
+                    float *         vdwparam,
+                    float *         vvdw,
+                    float *         p_tabscale,
+                    float *         VFtab,
+                    float *         invsqrta,
+                    float *         dvda,
+                    float *         p_gbtabscale,
+                    float *         GBtab,
+                    int *           p_nthreads,
+                    int *           count,
+                    void *          mtx,
+                    int *           outeriter,
+                    int *           inneriter,
+                    float *         work)
+{
+    int           nri,ntype,nthreads;
+    float         facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
+    int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
+    float         shX,shY,shZ;
+    float         rinvsq;
+    float         iq;
+    float         qq,vcoul,vctot;
+    int           nti;
+    int           tj;
+    float         rinvsix;
+    float         vvdw6,vvdwtot;
+    float         vvdw12;
+    float         r,rt,eps,eps2;
+    int           n0,nnn;
+    float         Y,F,Geps,Heps2,Fp,VV;
+    float         isai,isaj,isaprod,gbscale;
+    float         ix1,iy1,iz1;
+    float         jx1,jy1,jz1;
+    float         dx11,dy11,dz11,rsq11,rinv11;
+    float         c6,c12;
+    const int     fractshift = 12;
+    const int     fractmask = 8388607;
+    const int     expshift = 23;
+    const int     expmask = 2139095040;
+    const int     explsb = 8388608;
+    float         lu;
+    int           iexp,addr;
+    union { unsigned int bval; float fval; } bitpattern,result;
+
+    nri              = *p_nri;         
+    ntype            = *p_ntype;       
+    nthreads         = *p_nthreads;    
+    facel            = *p_facel;       
+    krf              = *p_krf;         
+    crf              = *p_crf;         
+    tabscale         = *p_tabscale;    
+    gbtabscale       = *p_gbtabscale;  
+    nj1              = 0;              
+
+    for(n=0; (n<nri); n++)
+    {
+        is3              = 3*shift[n];     
+        shX              = shiftvec[is3];  
+        shY              = shiftvec[is3+1];
+        shZ              = shiftvec[is3+2];
+        nj0              = jindex[n];      
+        nj1              = jindex[n+1];    
+        ii               = iinr[n];        
+        ii3              = 3*ii;           
+        ix1              = shX + pos[ii3+0];
+        iy1              = shY + pos[ii3+1];
+        iz1              = shZ + pos[ii3+2];
+        iq               = facel*charge[ii];
+        isai             = invsqrta[ii];   
+        nti              = 2*ntype*type[ii];
+        vctot            = 0;              
+        vvdwtot          = 0;              
+        
+        for(k=nj0; (k<nj1); k++)
+        {
+            jnr              = jjnr[k];        
+            j3               = 3*jnr;          
+            jx1              = pos[j3+0];      
+            jy1              = pos[j3+1];      
+            jz1              = pos[j3+2];      
+            dx11             = ix1 - jx1;      
+            dy11             = iy1 - jy1;      
+            dz11             = iz1 - jz1;      
+            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
+            bitpattern.fval  = rsq11;          
+            iexp             = (((bitpattern.bval)&expmask)>>expshift);
+            addr             = (((bitpattern.bval)&(fractmask|explsb))>>fractshift);
+            result.bval      = gmx_invsqrt_exptab[iexp] | gmx_invsqrt_fracttab[addr];
+            lu               = result.fval;    
+            rinv11           = (0.5*lu*(3.0-((rsq11*lu)*lu)));
+            isaj             = invsqrta[jnr];  
+            isaprod          = isai*isaj;      
+            qq               = iq*charge[jnr]; 
+            vcoul            = qq*rinv11;      
+            qq               = isaprod*(-qq);  
+            gbscale          = isaprod*gbtabscale;
+            tj               = nti+2*type[jnr];
+            c6               = vdwparam[tj];   
+            c12              = vdwparam[tj+1]; 
+            rinvsq           = rinv11*rinv11;  
+            r                = rsq11*rinv11;   
+            rt               = r*gbscale;      
+            n0               = rt;             
+            eps              = rt-n0;          
+            eps2             = eps*eps;        
+            nnn              = 4*n0;           
+            Y                = GBtab[nnn];     
+            F                = GBtab[nnn+1];   
+            Geps             = eps*GBtab[nnn+2];
+            Heps2            = eps2*GBtab[nnn+3];
+            Fp               = F+Geps+Heps2;   
+            VV               = Y+eps*Fp;       
+            vgb              = qq*VV;          
+            vctot            = vctot + vcoul;  
+            rinvsix          = rinvsq*rinvsq*rinvsq;
+            vvdw6            = c6*rinvsix;     
+            vvdw12           = c12*rinvsix*rinvsix;
+            vvdwtot          = vvdwtot+vvdw12-vvdw6;
+        }
+        
+        ggid             = gid[n];         
+        Vc[ggid]         = Vc[ggid] + vctot;
+        vvdw[ggid]       = vvdw[ggid] + vvdwtot;
+    }
+    
+    *outeriter       = nri;            
+    *inneriter       = nj1;            
 }
+
+
index 87159d59deb0ca5a9082ab249b0c4f413e1625c0..90f260f66445d4544f71174b79cec8314318d922 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Options used when generation this file:
  * Language:         c
- * Precision:        double
+ * Precision:        single
  * Threads:          no
  * Software invsqrt: yes
  * Prefetch forces:  no
 #include<math.h>
 #include<vec.h>
 
+
 #include <xmmintrin.h>
 #include <emmintrin.h>
 
-#include <gmx_sse2_double.h>
+#include <gmx_sse2_single.h>
 
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
 
-#include "nb_kernel430_ia32_sse2.h"
+#include "nb_kernel430_ia32_sse.h"
+
+
 
-void nb_kernel430_ia32_sse2(int *           p_nri,
-                              int *           iinr,
-                              int *           jindex,
-                              int *           jjnr,
-                              int *           shift,
-                              double *         shiftvec,
-                              double *         fshift,
-                              int *           gid,
-                              double *         pos,
-                              double *         faction,
-                              double *         charge,
-                              double *         p_facel,
-                              double *         p_krf,
-                              double *         p_crf,
-                              double *         vc,
-                              int *           type,
-                              int *           p_ntype,
-                              double *         vdwparam,
-                              double *         vvdw,
-                              double *         p_tabscale,
-                              double *         VFtab,
-                              double *         invsqrta,
-                              double *         dvda,
-                              double *         p_gbtabscale,
-                              double *         GBtab,
-                              int *           p_nthreads,
-                              int *           count,
-                              void *          mtx,
-                              int *           outeriter,
-                              int *           inneriter,
-                              double *         work)
+void nb_kernel430_ia32_sse(int *           p_nri,
+                                                  int *           iinr,
+                                                  int *           jindex,
+                                                  int *           jjnr,
+                                                  int *           shift,
+                                                  float *         shiftvec,
+                                                  float *         fshift,
+                                                  int *           gid,
+                                                  float *         pos,
+                                                  float *         faction,
+                                                  float *         charge,
+                                                  float *         p_facel,
+                                                  float *         p_krf,
+                                                  float *         p_crf,
+                                                  float *         Vc,
+                                                  int *           type,
+                                                  int *           p_ntype,
+                                                  float *         vdwparam,
+                                                  float *         Vvdw,
+                                                  float *         p_tabscale,
+                                                  float *         VFtab,
+                                                  float *         invsqrta,
+                                                  float *         dvda,
+                                                  float *         p_gbtabscale,
+                                                  float *         GBtab,
+                                                  int *           p_nthreads,
+                                                  int *           count,
+                                                  void *          mtx,
+                                                  int *           outeriter,
+                                                  int *           inneriter,
+                                                  float *         work)
 {
     int           nri,ntype,nthreads;
+    float         facel,krf,crf,tabscale,gbtabscale;
     int           n,ii,is3,ii3,k,nj0,nj1,ggid;
-    double        shX,shY,shZ;
+    float         shX,shY,shZ;
        int                       offset,nti;
-    int           jnrA,jnrB;
-    int           j3A,j3B;
-       int           tjA,tjB;
+       int           jnr,jnr2,jnr3,jnr4,j3,j23,j33,j43;
+       int           tj,tj2,tj3,tj4;
        gmx_gbdata_t *gbdata;
-       double *      gpol;
-    
-       __m128d  iq,qq,jq,isai;
-       __m128d  ix,iy,iz;
-       __m128d  jx,jy,jz;
-       __m128d  dx,dy,dz;
-       __m128d  vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
-       __m128d  fix,fiy,fiz,tx,ty,tz,rsq;
-       __m128d  rinv,isaj,isaprod;
-       __m128d  vcoul,fscal,gbscale,c6,c12;
-       __m128d  rinvsq,r,rtab;
-       __m128d  eps,Y,F,G,H;
-    __m128d  VV,FF,Fp;
-       __m128d  vgb,fijGB,dvdatmp;
-       __m128d  rinvsix,vvdw6,vvdw12,vvdwtmp;
-       __m128d  facel,gbtabscale,dvdaj;
-    __m128d  fijD,fijR,fijC;
-    __m128d  xmm1,tabscale,eps2;
+       float *        gpol;
+       
+       __m128   iq,qq,q,isai;
+       __m128   ix,iy,iz;
+       __m128   jx,jy,jz;
+       __m128   xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
+       __m128   dx1,dy1,dz1;
+       __m128   vctot,Vvdwtot,dvdasum;
+       __m128   fix,fiy,fiz,rsq;
+       __m128   t1,t2,t3;
+       __m128   rinv,isaj,isaprod;
+       __m128   vcoul,fscal,gbscale,c6,c12;
+       __m128   rinvsq,dvdaj,r,rt;
+       __m128   eps,eps2,Y,F,G,H,Geps,Heps2;
+       __m128   Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp;
+       __m128   rinvsix,Vvdw6,Vvdw12,Vvdwtmp,vgbtot,n0f;
+       __m128   fac_sse,tabscale_sse,gbtabscale_sse;
+       
        __m128i  n0, nnn;
-    
+       const __m128 neg    = _mm_set1_ps(-1.0f);
+       const __m128 zero   = _mm_set1_ps(0.0f);
+    const __m128 half   = _mm_set1_ps(0.5f);
+       const __m128 two    = _mm_set1_ps(2.0f);
+       const __m128 three  = _mm_set1_ps(3.0f);
+       const __m128 six    = _mm_set1_ps(6.0f);
+    const __m128 twelwe = _mm_set1_ps(12.0f);
        
-       const __m128d neg        = _mm_set1_pd(-1.0);
-       const __m128d zero       = _mm_set1_pd(0.0);
-       const __m128d minushalf  = _mm_set1_pd(-0.5);
-       const __m128d two        = _mm_set1_pd(2.0);
+       __m128i four        = _mm_set1_epi32(4);
+       __m128i maski       = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);     
+       __m128i mask        = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);   
        
-       gbdata     = (gmx_gbdata_t *)work;
-       gpol       = gbdata->gpol;
-    
-       nri        = *p_nri;
-       ntype      = *p_ntype;
-    
-    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
-    gbtabscale = _mm_load1_pd(p_gbtabscale);  
-    facel      = _mm_load1_pd(p_facel);
-    tabscale   = _mm_load1_pd(p_tabscale);
-    
-    nj1         = 0;
-    jnrA = jnrB = 0;
-    j3A = j3B   = 0;
-    jx          = _mm_setzero_pd();
-    jy          = _mm_setzero_pd();
-    jz          = _mm_setzero_pd();
-    c6          = _mm_setzero_pd();
-    c12         = _mm_setzero_pd();
+       float vct,vdwt,vgbt,dva,isai_f;
        
-       for(n=0;n<nri;n++)
-       {
+       gbdata          = (gmx_gbdata_t *)work;
+       gpol            = gbdata->gpol;
+               
+       nri              = *p_nri;         
+    ntype            = *p_ntype;       
+    nthreads         = *p_nthreads;    
+    facel            = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));       
+    krf              = *p_krf;         
+    crf              = *p_crf;         
+    tabscale         = *p_tabscale;    
+    gbtabscale       = *p_gbtabscale;  
+    nj1              = 0;
+
+       /* Splat variables */
+       fac_sse        = _mm_load1_ps(&facel);
+       tabscale_sse   = _mm_load1_ps(&tabscale);
+       gbtabscale_sse = _mm_load1_ps(&gbtabscale);
+       
+       
+       /* Keep the compiler happy */
+       Vvdwtmp = _mm_setzero_ps();
+       dvdatmp = _mm_setzero_ps();
+       vcoul   = _mm_setzero_ps();
+       vgb     = _mm_setzero_ps();
+       t1      = _mm_setzero_ps();
+       t2      = _mm_setzero_ps();
+       t3      = _mm_setzero_ps();
+       xmm1    = _mm_setzero_ps();
+       xmm2    = _mm_setzero_ps();
+       xmm3    = _mm_setzero_ps();
+       xmm4    = _mm_setzero_ps();
+       j23     = j33  = 0;
+       jnr2    = jnr3 = 0;
+       
+    for(n=0; (n<nri); n++)
+    {
         is3              = 3*shift[n];     
         shX              = shiftvec[is3];  
         shY              = shiftvec[is3+1];
@@ -124,335 +148,989 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
         nj1              = jindex[n+1];    
         ii               = iinr[n];        
         ii3              = 3*ii;           
-               
-               ix               = _mm_set1_pd(shX+pos[ii3+0]);
-               iy               = _mm_set1_pd(shY+pos[ii3+1]);
-               iz               = _mm_set1_pd(shZ+pos[ii3+2]);
-        
-               iq               = _mm_load1_pd(charge+ii);
-               iq               = _mm_mul_pd(iq,facel);
-        
-               isai             = _mm_load1_pd(invsqrta+ii);
         
+               ix               = _mm_set1_ps(shX+pos[ii3+0]);
+               iy               = _mm_set1_ps(shY+pos[ii3+1]);
+               iz               = _mm_set1_ps(shZ+pos[ii3+2]);
+               
+               iq               = _mm_load1_ps(charge+ii);
+               iq               = _mm_mul_ps(iq,fac_sse);
+               
+        isai_f           = invsqrta[ii];   
+               isai             = _mm_load1_ps(&isai_f);
+               
                nti              = 2*ntype*type[ii];
                
-               vctot            = _mm_setzero_pd();
-               vvdwtot          = _mm_setzero_pd();
-               vgbtot           = _mm_setzero_pd();
-               dvdasum          = _mm_setzero_pd();
-               fix              = _mm_setzero_pd();
-               fiy              = _mm_setzero_pd();
-               fiz              = _mm_setzero_pd();
+               vctot            = _mm_setzero_ps();
+               Vvdwtot          = _mm_setzero_ps();
+               dvdasum          = _mm_setzero_ps();
+               vgbtot           = _mm_setzero_ps();
+               fix              = _mm_setzero_ps();
+               fiy              = _mm_setzero_ps();
+               fiz              = _mm_setzero_ps();
+               
+               offset           = (nj1-nj0)%4;
         
-               for(k=nj0;k<nj1-1; k+=2)
+        for(k=nj0;k<nj1-offset;k+=4)
                {
-                       jnrA    = jjnr[k];
-                       jnrB    = jjnr[k+1];
-                       
-                       j3A     = jnrA * 3;
-                       j3B     = jnrB * 3;
-            
-            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
-            
-                       dx           = _mm_sub_pd(ix,jx);
-                       dy           = _mm_sub_pd(iy,jy);
-                       dz           = _mm_sub_pd(iz,jz);
-            
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
-                       rinvsq       = _mm_mul_pd(rinv,rinv);
-            
-                       /***********************************/
-                       /* INTERACTION SECTION STARTS HERE */
-                       /***********************************/
-                       GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
-                       GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
-            
-            /* Lennard-Jones */
-            tjA          = nti+2*type[jnrA];
-                       tjB          = nti+2*type[jnrB];
-            
-            GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
-                       
-                       isaprod      = _mm_mul_pd(isai,isaj);
-                       qq           = _mm_mul_pd(iq,jq);            
-                       vcoul        = _mm_mul_pd(qq,rinv);
-                       fscal        = _mm_mul_pd(vcoul,rinv);                                 
-            vctot        = _mm_add_pd(vctot,vcoul);
-            
-            /* Polarization interaction */
-                       qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
-                       gbscale      = _mm_mul_pd(isaprod,gbtabscale);
-            
-                       /* Calculate GB table index */
-                       r            = _mm_mul_pd(rsq,rinv);
-                       rtab         = _mm_mul_pd(r,gbscale);
-                       
-                       n0                   = _mm_cvttpd_epi32(rtab);
-                       eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
-                       nnn                  = _mm_slli_epi32(n0,2);
-                       
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
-            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
-            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
-            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
-            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
-            vgb     = _mm_mul_pd(Y, qq);           
-            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_pd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
-                       
-            /* Calculate VDW table index */
-                       rtab    = _mm_mul_pd(r,tabscale);
-                       n0      = _mm_cvttpd_epi32(rtab);
-                       eps     = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
-                       eps2    = _mm_mul_pd(eps,eps);
+                       jnr     = jjnr[k];   
+                       jnr2    = jjnr[k+1];
+                       jnr3    = jjnr[k+2];
+                       jnr4    = jjnr[k+3];
+                       
+            j3      = 3*jnr;  
+                       j23             = 3*jnr2;
+                       j33     = 3*jnr3;
+                       j43     = 3*jnr4;
+                       
+                       xmm1    = _mm_loadh_pi(xmm1, (__m64 *) (pos+j3));  /* x1 y1 - - */
+                       xmm2    = _mm_loadh_pi(xmm2, (__m64 *) (pos+j23)); /* x2 y2 - - */
+                       xmm3    = _mm_loadh_pi(xmm3, (__m64 *) (pos+j33)); /* x3 y3 - - */
+                       xmm4    = _mm_loadh_pi(xmm4, (__m64 *) (pos+j43)); /* x4 y4 - - */                      
+                       
+                       xmm5    = _mm_load1_ps(pos+j3+2);  
+                       xmm6    = _mm_load1_ps(pos+j23+2); 
+                       xmm7    = _mm_load1_ps(pos+j33+2); 
+                       xmm8    = _mm_load1_ps(pos+j43+2);
+                       
+                       /* transpose */
+                       xmm5    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+                       xmm6    = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
+                       jz      = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
+                       
+                       xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+                       xmm2    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
+                       jx      = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
+                       jy      = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
+                       
+                       dx1     = _mm_sub_ps(ix,jx);
+                       dy1     = _mm_sub_ps(iy,jy);
+                       dz1     = _mm_sub_ps(iz,jz);
+                       
+                       t1      = _mm_mul_ps(dx1,dx1);
+                       t2      = _mm_mul_ps(dy1,dy1);
+                       t3      = _mm_mul_ps(dz1,dz1);
+                       
+                       rsq     = _mm_add_ps(t1,t2);
+                       rsq     = _mm_add_ps(rsq,t3);
+                       rinv    = gmx_mm_invsqrt_ps(rsq);
+                       
+                       xmm1    = _mm_load_ss(invsqrta+jnr); 
+                       xmm2    = _mm_load_ss(invsqrta+jnr2);
+                       xmm3    = _mm_load_ss(invsqrta+jnr3);
+                       xmm4    = _mm_load_ss(invsqrta+jnr4);
+                       
+                       xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
+                       xmm3    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
+                       isaj    = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+                       
+                       isaprod = _mm_mul_ps(isai,isaj);
+                       
+                       xmm1    = _mm_load_ss(charge+jnr); 
+                       xmm2    = _mm_load_ss(charge+jnr2); 
+                       xmm3    = _mm_load_ss(charge+jnr3); 
+                       xmm4    = _mm_load_ss(charge+jnr4);
+                       
+                       xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
+                       xmm3    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
+                       q       = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+                       qq      = _mm_mul_ps(iq,q);
+                       
+                       vcoul   = _mm_mul_ps(qq,rinv);
+                       fscal   = _mm_mul_ps(vcoul,rinv);
+                       qq      = _mm_mul_ps(qq,neg);
+                       qq      = _mm_mul_ps(isaprod,qq);
+                       gbscale = _mm_mul_ps(isaprod,gbtabscale_sse);
+                       
+            tj      = nti+2*type[jnr];
+                       tj2     = nti+2*type[jnr2];
+                       tj3     = nti+2*type[jnr3];
+                       tj4     = nti+2*type[jnr4];
+                       
+                       xmm1    = _mm_loadl_pi(xmm1,(__m64 *) (vdwparam+tj));
+                       xmm1    = _mm_loadh_pi(xmm1,(__m64 *) (vdwparam+tj2));
+                       xmm2    = _mm_loadl_pi(xmm2,(__m64 *) (vdwparam+tj3));
+                       xmm2    = _mm_loadh_pi(xmm2,(__m64 *) (vdwparam+tj4));
+                       
+                       xmm3    = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,2,0,2)); /* c61 c62 c61 c62 */
+                       xmm4    = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(0,2,0,2)); /* c63 c64 c63 c64 */
+                       c6      = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,1,0,1)); /* c61 c62 c63 c64 */
+                       
+                       xmm3    = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1)); /* c121 c122 c121 c122 */
+                       xmm4    = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,3,1)); /* c123 c124 c123 c124 */
+                       c12     = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(1,0,1,0)); /* c121 c122 c123 c124 */             
+                               
+                       xmm1    = _mm_load_ss(dvda+jnr); 
+                       xmm2    = _mm_load_ss(dvda+jnr2); 
+                       xmm3    = _mm_load_ss(dvda+jnr3);
+                       xmm4    = _mm_load_ss(dvda+jnr4);
+                       
+                       xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
+                       xmm3    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
+                       dvdaj   = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+                       
+                       /* Calculate GB table index */
+                       r       = _mm_mul_ps(rsq,rinv);
+                       rt      = _mm_mul_ps(r,gbscale);
+                       
+                       n0      = _mm_cvttps_epi32(rt);
+                       n0f     = _mm_cvtepi32_ps(n0);
+                       eps     = _mm_sub_ps(rt,n0f);
+                       
+                       eps2    = _mm_mul_ps(eps,eps);
+                       nnn     = _mm_slli_epi32(n0,2);
+               
+                       /* the tables are 16-byte aligned, so we can use _mm_load_ps */                 
+                       xmm1    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+                       
+                       /* transpose 4*4 */
+                       xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+                       xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+                       xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+                       xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+                       
+                       Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
+                       F               = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
+                       G               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
+                       H               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */
+                       
+                       Geps    = _mm_mul_ps(G,eps);
+                       Heps2   = _mm_mul_ps(H,eps2);
+                       Fp      = _mm_add_ps(F,Geps);
+                       Fp      = _mm_add_ps(Fp,Heps2);
+                       
+                       VV      = _mm_mul_ps(Fp,eps);
+                       VV      = _mm_add_ps(Y,VV);
+                       
+                       xmm1    = _mm_mul_ps(two,Heps2);
+                       FF      = _mm_add_ps(Fp,Geps);
+                       FF      = _mm_add_ps(FF,xmm1);
+                       
+                       vgb     = _mm_mul_ps(qq,VV);
+                       fijC    = _mm_mul_ps(qq,FF);
+                       fijC    = _mm_mul_ps(fijC,gbscale);
+                       
+                       dvdatmp = _mm_mul_ps(fijC,r);
+                       dvdatmp = _mm_add_ps(vgb,dvdatmp);
+                       dvdatmp = _mm_mul_ps(half,dvdatmp);
+                       dvdatmp = _mm_mul_ps(neg,dvdatmp);
+                       
+                       dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+                       
+                       xmm1    = _mm_mul_ps(dvdatmp,isaj);
+                       xmm1    = _mm_mul_ps(xmm1,isaj);
+                       dvdaj   = _mm_add_ps(dvdaj,xmm1);
+                       
+                       _mm_store_ss(dvda+jnr,dvdaj);
+                       xmm1    = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1)); 
+                       _mm_store_ss(dvda+jnr2,xmm1);
+                       xmm1    = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(1,0,3,2)); 
+                       _mm_store_ss(dvda+jnr3,xmm1);
+                       xmm1    = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(2,1,0,3));
+                       _mm_store_ss(dvda+jnr4,xmm1);
+                       
+                       vctot   = _mm_add_ps(vctot,vcoul);
+                       vgbtot  = _mm_add_ps(vgbtot,vgb);
+                       
+                       /* Calculate VDW table index */
+                       rt      = _mm_mul_ps(r,tabscale_sse);
+                       n0      = _mm_cvttps_epi32(rt);
+                       n0f     = _mm_cvtepi32_ps(n0);
+                       eps     = _mm_sub_ps(rt,n0f);
+                       eps2    = _mm_mul_ps(eps,eps);
                        nnn     = _mm_slli_epi32(n0,3);
+
+                       /* Tabulated VdW interaction - dispersion */
+                       xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+                       
+                       /* transpose 4*4 */
+                       xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+                       xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+                       xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+                       xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+                       
+                       Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
+                       F               = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
+                       G               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
+                       H               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */
+                       
+                       Geps    = _mm_mul_ps(G,eps);
+                       Heps2   = _mm_mul_ps(H,eps2);
+                       Fp      = _mm_add_ps(F,Geps);
+                       Fp      = _mm_add_ps(Fp,Heps2);
+                       VV      = _mm_mul_ps(Fp,eps);
+                       VV      = _mm_add_ps(Y,VV);
+                       xmm1    = _mm_mul_ps(two,Heps2);
+                       FF      = _mm_add_ps(Fp,Geps);
+                       FF      = _mm_add_ps(FF,xmm1);
+
+                       Vvdw6   = _mm_mul_ps(c6,VV);
+                       fijD    = _mm_mul_ps(c6,FF);
+                       
+                       /* Tabulated VdW interaction - repulsion */
+                       nnn     = _mm_add_epi32(nnn,four);
+                       
+                       xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+                       
+                       /* transpose 4*4 */
+                       xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+                       xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+                       xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+                       xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+                       
+                       Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
+                       F               = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
+                       G               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
+                       H               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */
+                       
+                       Geps    = _mm_mul_ps(G,eps);
+                       Heps2   = _mm_mul_ps(H,eps2);
+                       Fp      = _mm_add_ps(F,Geps);
+                       Fp      = _mm_add_ps(Fp,Heps2);
+                       VV      = _mm_mul_ps(Fp,eps);
+                       VV      = _mm_add_ps(Y,VV);
+                       xmm1    = _mm_mul_ps(two,Heps2);
+                       FF      = _mm_add_ps(Fp,Geps);
+                       FF      = _mm_add_ps(FF,xmm1);
+                       
+                       Vvdw12  = _mm_mul_ps(c12,VV);
+                       fijR    = _mm_mul_ps(c12,FF);
+                       
+                       Vvdwtmp = _mm_add_ps(Vvdw12,Vvdw6);
+                       Vvdwtot = _mm_add_ps(Vvdwtot,Vvdwtmp);
+                       
+                       xmm1    = _mm_add_ps(fijD,fijR);
+                       xmm1    = _mm_mul_ps(xmm1,tabscale_sse);
+                       xmm1    = _mm_add_ps(xmm1,fijC);
+                       xmm1    = _mm_sub_ps(xmm1,fscal);
+                       fscal   = _mm_mul_ps(xmm1,neg);
+                       fscal   = _mm_mul_ps(fscal,rinv);
+       
+                       t1      = _mm_mul_ps(fscal,dx1);
+                       t2      = _mm_mul_ps(fscal,dy1);
+                       t3      = _mm_mul_ps(fscal,dz1);
+                       
+                       fix     = _mm_add_ps(fix,t1);
+                       fiy     = _mm_add_ps(fiy,t2);
+                       fiz     = _mm_add_ps(fiz,t3);
+                       
+                       xmm1    = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3));  /* fx1 fy1 - - */
+                       xmm2    = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23)); /* fx2 fy2 - - */
+                       xmm3    = _mm_loadh_pi(xmm3, (__m64 *) (faction+j33)); /* fx3 fy3 - - */
+                       xmm4    = _mm_loadh_pi(xmm4, (__m64 *) (faction+j43)); /* fx4 fy4 - - */
+                       
+                       xmm5    = _mm_load1_ps(faction+j3+2); 
+                       xmm6    = _mm_load1_ps(faction+j23+2);
+                       xmm7    = _mm_load1_ps(faction+j33+2);
+                       xmm8    = _mm_load1_ps(faction+j43+2);
+                       
+                       /* transpose the forces */
+                       xmm5    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+                       xmm6    = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
+                       xmm7    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
+                       
+                       xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+                       xmm2    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
+                       
+                       xmm5    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
+                       xmm6    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
+                       
+                       /* subtract partial terms */
+                       xmm5    = _mm_sub_ps(xmm5,t1);
+                       xmm6    = _mm_sub_ps(xmm6,t2);
+                       xmm7    = _mm_sub_ps(xmm7,t3);
+                       
+                       xmm1    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
+                       xmm2    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2));
+                       xmm1    = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0));
+                       xmm2    = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0));
+                       
+                       /* store fx's and fy's */
+                       _mm_storel_pi( (__m64 *) (faction+j3),xmm1);
+                       _mm_storeh_pi( (__m64 *) (faction+j23),xmm1);
+                       _mm_storel_pi( (__m64 *) (faction+j33),xmm2);
+                       _mm_storeh_pi( (__m64 *) (faction+j43),xmm2);
+                       
+                       /* ..then z */
+                       _mm_store_ss(faction+j3+2,xmm7);
+                       xmm7    = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+                       _mm_store_ss(faction+j23+2,xmm7);
+                       xmm7    = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+                       _mm_store_ss(faction+j33+2,xmm7);
+                       xmm7    = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+                       _mm_store_ss(faction+j43+2,xmm7);       
                        
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
-                       H       = _mm_mul_pd(H,eps2);
-                       Fp      = _mm_add_pd(F,G);
-                       Fp      = _mm_add_pd(Fp,H);
-                       VV      = _mm_mul_pd(Fp,eps);
-                       VV      = _mm_add_pd(Y,VV);
-                       xmm1    = _mm_mul_pd(two,H);
-                       FF      = _mm_add_pd(Fp,G);
-                       FF      = _mm_add_pd(FF,xmm1);
-                       
-                       vvdw6   = _mm_mul_pd(c6,VV);
-                       fijD    = _mm_mul_pd(c6,FF);
-            
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
-            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
-            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_pd(G,eps);
-                       H       = _mm_mul_pd(H,eps2);
-                       Fp      = _mm_add_pd(F,G);
-                       Fp      = _mm_add_pd(Fp,H);
-                       VV      = _mm_mul_pd(Fp,eps);
-                       VV      = _mm_add_pd(Y,VV);
-                       xmm1    = _mm_mul_pd(two,H);
-                       FF      = _mm_add_pd(Fp,G);
-                       FF      = _mm_add_pd(FF,xmm1);
-                       
-                       vvdw12  = _mm_mul_pd(c12,VV);
-                       fijR    = _mm_mul_pd(c12,FF);
-                       
-                       vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
-                       vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
-            
-                       xmm1    = _mm_add_pd(fijD,fijR);
-                       xmm1    = _mm_mul_pd(xmm1,tabscale);
-                       xmm1    = _mm_add_pd(xmm1,fijC);
-                       xmm1    = _mm_sub_pd(xmm1,fscal);
-                       fscal   = _mm_mul_pd(xmm1,neg);
-                       fscal   = _mm_mul_pd(fscal,rinv);
-            
-            /***********************************/
-                       /*  INTERACTION SECTION ENDS HERE  */
-                       /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_pd(fscal,dx);
-            ty           = _mm_mul_pd(fscal,dy);
-            tz           = _mm_mul_pd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_pd(fix,tx);
-            fiy          = _mm_add_pd(fiy,ty);
-            fiz          = _mm_add_pd(fiz,tz);
-            
-            /* Store j forces back */
-                       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
-               
-               /* In double precision, offset can only be either 0 or 1 */
-               if(k<nj1)
+        
+               if(offset!=0)
                {
-                       jnrA    = jjnr[k];
-                       j3A     = jnrA * 3;
-            
-            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-            
-                       dx           = _mm_sub_sd(ix,jx);
-                       dy           = _mm_sub_sd(iy,jy);
-                       dz           = _mm_sub_sd(iz,jz);
-            
-            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
-            
-            rinv         = gmx_mm_invsqrt_pd(rsq);
-                       rinvsq       = _mm_mul_sd(rinv,rinv);
-            
-                       /***********************************/
-                       /* INTERACTION SECTION STARTS HERE */
-                       /***********************************/
-                       GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
-                       GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-            
-            /* Lennard-Jones */
-            tjA          = nti+2*type[jnrA];
-            
-            GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
-                       
-                       isaprod      = _mm_mul_sd(isai,isaj);
-                       qq           = _mm_mul_sd(iq,jq);            
-                       vcoul        = _mm_mul_sd(qq,rinv);
-                       fscal        = _mm_mul_sd(vcoul,rinv);                                 
-            vctot        = _mm_add_sd(vctot,vcoul);
-            
-            /* Polarization interaction */
-                       qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
-                       gbscale      = _mm_mul_sd(isaprod,gbtabscale);
-            
-                       /* Calculate GB table index */
-                       r            = _mm_mul_sd(rsq,rinv);
-                       rtab         = _mm_mul_sd(r,gbscale);
-                       
-                       n0                   = _mm_cvttpd_epi32(rtab);
-                       eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
-                       nnn                  = _mm_slli_epi32(n0,2);
-                       
-            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
-            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
-            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
-            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
-            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
-            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
-            vgb     = _mm_mul_sd(Y, qq);           
-            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-            
-            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-            
-            vgbtot  = _mm_add_sd(vgbtot, vgb);
-            
-            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
-            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-            
-            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
-                       
-            /* Calculate VDW table index */
-                       rtab    = _mm_mul_sd(r,tabscale);
-                       n0      = _mm_cvttpd_epi32(rtab);
-                       eps     = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
-                       eps2    = _mm_mul_sd(eps,eps);
+                       if(offset==1)
+                       {
+                               jnr   = jjnr[k];
+                               j3    = jnr*3;
+                               tj        = nti+2*type[jnr];
+                               
+                               xmm1  = _mm_loadl_pi(xmm1, (__m64 *) (pos+j3));
+                               xmm5  = _mm_load1_ps(pos+j3+2);
+                               
+                               xmm6  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
+                               xmm4  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+                               
+                               isaj  = _mm_load_ss(invsqrta+jnr);
+                               dvdaj = _mm_load_ss(dvda+jnr);
+                               q     = _mm_load_ss(charge+jnr);
+                               c6    = _mm_load_ss(vdwparam+tj);
+                               c12   = _mm_load_ss(vdwparam+tj+1);
+                               
+                               mask  =  _mm_set_epi32(0,0,0,0xffffffff);
+                               
+                       }
+                       else if(offset==2)
+                       {
+                               jnr   = jjnr[k];
+                               jnr2  = jjnr[k+1];
+                               
+                               j3    = jnr*3;
+                               j23   = jnr2*3;
+                               
+                               tj        = nti+2*type[jnr];
+                               tj2       = nti+2*type[jnr2];
+                               
+                               xmm1  = _mm_loadh_pi(xmm1, (__m64 *) (pos+j3));
+                               xmm2  = _mm_loadh_pi(xmm2, (__m64 *) (pos+j23));
+                               
+                               xmm5  = _mm_load1_ps(pos+j3+2);
+                               xmm6  = _mm_load1_ps(pos+j23+2);
+                               
+                               xmm5  = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+                               xmm5  = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));
+                               
+                               xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+                               xmm6  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+                               xmm4  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
+                               
+                               xmm1  = _mm_load_ss(invsqrta+jnr);
+                               xmm2  = _mm_load_ss(invsqrta+jnr2);
+                               xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+                               isaj  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+                               
+                               xmm1  = _mm_load_ss(dvda+jnr);
+                               xmm2  = _mm_load_ss(dvda+jnr2);
+                               xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+                               dvdaj = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+                               
+                               xmm1  = _mm_load_ss(charge+jnr);
+                               xmm2  = _mm_load_ss(charge+jnr2);
+                               xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+                               q     = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+                               
+                               xmm1  = _mm_loadl_pi(xmm1,(__m64 *) (vdwparam+tj));
+                               xmm1  = _mm_loadh_pi(xmm1,(__m64 *) (vdwparam+tj2));
+                               
+                               c6    = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+                               c12   = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
+                               
+                               mask  = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+                       }
+                       else
+                       {
+                               jnr   = jjnr[k];
+                               jnr2  = jjnr[k+1];
+                               jnr3  = jjnr[k+2];
+                               
+                               j3    = jnr*3;
+                               j23   = jnr2*3;
+                               j33   = jnr3*3;
+                               
+                               tj        = nti+2*type[jnr];
+                               tj2       = nti+2*type[jnr2];
+                               tj3       = nti+2*type[jnr3];
+                               
+                               xmm1  = _mm_loadh_pi(xmm1,(__m64 *) (pos+j3)); 
+                               xmm2  = _mm_loadh_pi(xmm2,(__m64 *) (pos+j23)); 
+                               xmm3  = _mm_loadh_pi(xmm3,(__m64 *) (pos+j33)); 
+                               
+                               xmm5  = _mm_load1_ps(pos+j3+2); 
+                               xmm6  = _mm_load1_ps(pos+j23+2); 
+                               xmm7  = _mm_load1_ps(pos+j33+2); 
+                               
+                               xmm5  = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0)); 
+                               xmm5  = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));        
+                               
+                               xmm1  = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2)); 
+                               xmm2  = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2)); 
+                               
+                               xmm6  = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0)); 
+                               xmm4  = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1)); 
+                               
+                               xmm1  = _mm_load_ss(invsqrta+jnr);
+                               xmm2  = _mm_load_ss(invsqrta+jnr2);
+                               xmm3  = _mm_load_ss(invsqrta+jnr3);
+                               
+                               xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); 
+                               xmm3  = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0)); 
+                               isaj  = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+                               
+                               xmm1  = _mm_load_ss(dvda+jnr);
+                               xmm2  = _mm_load_ss(dvda+jnr2);
+                               xmm3  = _mm_load_ss(dvda+jnr3);
+                               
+                               xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); 
+                               xmm3  = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0)); 
+                               dvdaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+                               
+                               xmm1  = _mm_load_ss(charge+jnr);
+                               xmm2  = _mm_load_ss(charge+jnr2);
+                               xmm3  = _mm_load_ss(charge+jnr3);
+                               
+                               xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); 
+                               xmm3  = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0)); 
+                               q     = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+                               
+                               xmm1  = _mm_loadl_pi(xmm1, (__m64 *) (vdwparam+tj)); 
+                               xmm1  = _mm_loadh_pi(xmm1, (__m64 *) (vdwparam+tj2));
+                               xmm2  = _mm_loadl_pi(xmm2, (__m64 *) (vdwparam+tj3));
+                               
+                               xmm3  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,2,0,2));
+                               c6    = _mm_shuffle_ps(xmm3,xmm2,_MM_SHUFFLE(1,0,0,1));
+                               
+                               xmm3  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
+                               c12   = _mm_shuffle_ps(xmm3,xmm2,_MM_SHUFFLE(1,1,1,0));
+                               
+                               mask  = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+                       }
+                       
+                       jx      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm6);
+                       jy      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm4);
+                       jz      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm5);
+                       
+                       c6      = _mm_and_ps( gmx_mm_castsi128_ps(mask), c6);
+                       c12     = _mm_and_ps( gmx_mm_castsi128_ps(mask), c12);
+                       dvdaj   = _mm_and_ps( gmx_mm_castsi128_ps(mask), dvdaj);
+                       isaj    = _mm_and_ps( gmx_mm_castsi128_ps(mask), isaj);                 
+                       q       = _mm_and_ps( gmx_mm_castsi128_ps(mask), q);
+                       
+                       dx1     = _mm_sub_ps(ix,jx);
+                       dy1     = _mm_sub_ps(iy,jy);
+                       dz1     = _mm_sub_ps(iz,jz);
+                       
+                       t1      = _mm_mul_ps(dx1,dx1);
+                       t2      = _mm_mul_ps(dy1,dy1);
+                       t3      = _mm_mul_ps(dz1,dz1);
+                       
+                       rsq     = _mm_add_ps(t1,t2);
+                       rsq     = _mm_add_ps(rsq,t3);
+                       
+                       rinv    = gmx_mm_invsqrt_ps(rsq);
+                       
+                       isaprod = _mm_mul_ps(isai,isaj);
+                       qq      = _mm_mul_ps(iq,q);
+                       vcoul   = _mm_mul_ps(qq,rinv);
+                       fscal   = _mm_mul_ps(vcoul,rinv);
+                       
+                       qq      = _mm_mul_ps(qq,neg);
+                       qq      = _mm_mul_ps(isaprod,qq);
+                       
+                       gbscale = _mm_mul_ps(isaprod,gbtabscale_sse);
+                       rinvsq  = _mm_mul_ps(rinv,rinv);
+                       r       = _mm_mul_ps(rsq,rinv);
+                       rt      = _mm_mul_ps(r,gbscale);
+                       
+                       n0      = _mm_cvttps_epi32(rt);
+                       n0f     = _mm_cvtepi32_ps(n0);
+                       eps     = _mm_sub_ps(rt,n0f);
+                       
+                       eps2    = _mm_mul_ps(eps,eps);
+                       nnn     = _mm_slli_epi32(n0,2);
+                       
+                       /* the tables are 16-byte aligned, so we can use _mm_load_ps */                 
+                       xmm1    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+                       
+                       /* transpose 4*4 */
+                       xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+                       xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+                       xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+                       xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+                       
+                       Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); 
+                       F               = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); 
+                       G               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); 
+                       H               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); 
+                       
+                       Geps = _mm_mul_ps(G,eps);
+                       Heps2   = _mm_mul_ps(H,eps2);
+                       Fp      = _mm_add_ps(F,Geps);
+                       Fp      = _mm_add_ps(Fp,Heps2);
+                       VV      = _mm_mul_ps(Fp,eps);
+                       VV      = _mm_add_ps(Y,VV);
+                       
+                       xmm1    = _mm_mul_ps(two,Heps2);
+                       FF      = _mm_add_ps(Fp,Geps);
+                       FF      = _mm_add_ps(FF,xmm1);
+                       vgb     = _mm_mul_ps(qq,VV);
+                       fijC    = _mm_mul_ps(qq,FF);
+                       fijC    = _mm_mul_ps(fijC,gbscale);
+                       
+                       dvdatmp = _mm_mul_ps(fijC,r);
+                       dvdatmp = _mm_add_ps(vgb,dvdatmp);
+                       dvdatmp = _mm_mul_ps(half,dvdatmp);
+                       dvdatmp = _mm_mul_ps(neg,dvdatmp);
+                       
+                       dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+                       
+                       xmm1    = _mm_mul_ps(dvdatmp,isaj);
+                       xmm1    = _mm_mul_ps(xmm1,isaj);
+                       dvdaj   = _mm_add_ps(dvdaj,xmm1);
+                       
+                       vcoul   = _mm_and_ps( gmx_mm_castsi128_ps(mask), vcoul);
+                       vgb     = _mm_and_ps( gmx_mm_castsi128_ps(mask), vgb);
+                       
+                       vctot   = _mm_add_ps(vctot,vcoul);
+                       vgbtot  = _mm_add_ps(vgbtot,vgb);
+                                               
+                       /* Calculate VDW table index */
+                       rt      = _mm_mul_ps(r,tabscale_sse);
+                       n0      = _mm_cvttps_epi32(rt);
+                       n0f     = _mm_cvtepi32_ps(n0);
+                       eps     = _mm_sub_ps(rt,n0f);
+                       eps2    = _mm_mul_ps(eps,eps);
                        nnn     = _mm_slli_epi32(n0,3);
                        
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
-                       H       = _mm_mul_sd(H,eps2);
-                       Fp      = _mm_add_sd(F,G);
-                       Fp      = _mm_add_sd(Fp,H);
-                       VV      = _mm_mul_sd(Fp,eps);
-                       VV      = _mm_add_sd(Y,VV);
-                       xmm1    = _mm_mul_sd(two,H);
-                       FF      = _mm_add_sd(Fp,G);
-                       FF      = _mm_add_sd(FF,xmm1);
-                       
-                       vvdw6   = _mm_mul_sd(c6,VV);
-                       fijD    = _mm_mul_sd(c6,FF);
-            
-            /* Dispersion */
-            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
-            F            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(Y,F);
-            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
-            H            = _mm_setzero_pd();
-            GMX_MM_TRANSPOSE2_PD(G,H);
-            
-            G       = _mm_mul_sd(G,eps);
-                       H       = _mm_mul_sd(H,eps2);
-                       Fp      = _mm_add_sd(F,G);
-                       Fp      = _mm_add_sd(Fp,H);
-                       VV      = _mm_mul_sd(Fp,eps);
-                       VV      = _mm_add_sd(Y,VV);
-                       xmm1    = _mm_mul_sd(two,H);
-                       FF      = _mm_add_sd(Fp,G);
-                       FF      = _mm_add_sd(FF,xmm1);
-                       
-                       vvdw12  = _mm_mul_sd(c12,VV);
-                       fijR    = _mm_mul_sd(c12,FF);
-                       
-                       vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
-                       vvdwtot = _mm_add_sd(vvdwtot,vvdwtmp);
-            
-                       xmm1    = _mm_add_sd(fijD,fijR);
-                       xmm1    = _mm_mul_sd(xmm1,tabscale);
-                       xmm1    = _mm_add_sd(xmm1,fijC);
-                       xmm1    = _mm_sub_sd(xmm1,fscal);
-                       fscal   = _mm_mul_sd(xmm1,neg);
-                       fscal   = _mm_mul_sd(fscal,rinv);
-
-            /***********************************/
-                       /*  INTERACTION SECTION ENDS HERE  */
-                       /***********************************/
-            
-            /* Calculate temporary vectorial force */
-            tx           = _mm_mul_sd(fscal,dx);
-            ty           = _mm_mul_sd(fscal,dy);
-            tz           = _mm_mul_sd(fscal,dz);
-            
-            /* Increment i atom force */
-            fix          = _mm_add_sd(fix,tx);
-            fiy          = _mm_add_sd(fiy,ty);
-            fiz          = _mm_add_sd(fiz,tz);
-            
-            /* Store j forces back */
-                       GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
+                       /* Tabulated VdW interaction - dispersion */
+                       xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+               
+                       /* transpose 4*4 */
+                       xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+                       xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+                       xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+                       xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+                       
+                       Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
+                       F               = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
+                       G               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
+                       H               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */
+                       
+                       Geps    = _mm_mul_ps(G,eps);
+                       Heps2   = _mm_mul_ps(H,eps2);
+                       Fp      = _mm_add_ps(F,Geps);
+                       Fp      = _mm_add_ps(Fp,Heps2);
+                       VV      = _mm_mul_ps(Fp,eps);
+                       VV      = _mm_add_ps(Y,VV);
+                       xmm1    = _mm_mul_ps(two,Heps2);
+                       FF      = _mm_add_ps(Fp,Geps);
+                       FF      = _mm_add_ps(FF,xmm1);
+                       
+                       Vvdw6   = _mm_mul_ps(c6,VV);
+                       fijD    = _mm_mul_ps(c6,FF);
+                       
+                       /* Tabulated VdW interaction - repulsion */
+                       nnn     = _mm_add_epi32(nnn,four);
+                                       
+                       xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
+                       xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
+                       xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
+                       xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */
+                       
+                       /* transpose 4*4 */
+                       xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+                       xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+                       xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+                       xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+                       
+                       Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
+                       F               = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
+                       G               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
+                       H               = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */
+                       
+                       Geps    = _mm_mul_ps(G,eps);
+                       Heps2   = _mm_mul_ps(H,eps2);
+                       Fp      = _mm_add_ps(F,Geps);
+                       Fp      = _mm_add_ps(Fp,Heps2);
+                       VV      = _mm_mul_ps(Fp,eps);
+                       VV      = _mm_add_ps(Y,VV);
+                       xmm1    = _mm_mul_ps(two,Heps2);
+                       FF      = _mm_add_ps(Fp,Geps);
+                       FF      = _mm_add_ps(FF,xmm1);
+                       
+                       Vvdw12  = _mm_mul_ps(c12,VV);
+                       fijR    = _mm_mul_ps(c12,FF);
+                       
+                       Vvdwtmp = _mm_add_ps(Vvdw12,Vvdw6);
+                       Vvdwtot = _mm_add_ps(Vvdwtot,Vvdwtmp);
+                       
+                       xmm1    = _mm_add_ps(fijD,fijR);
+                       xmm1    = _mm_mul_ps(xmm1,tabscale_sse);
+                       xmm1    = _mm_add_ps(xmm1,fijC);
+                       xmm1    = _mm_sub_ps(xmm1,fscal);
+                       fscal   = _mm_mul_ps(xmm1,neg);
+                       fscal   = _mm_mul_ps(fscal,rinv);                       
+                       
+                       t1      = _mm_mul_ps(fscal,dx1);
+                       t2      = _mm_mul_ps(fscal,dy1);
+                       t3      = _mm_mul_ps(fscal,dz1);
+                       
+                       if(offset==1)
+                       {
+                               _mm_store_ss(dvda+jnr,dvdaj);
+                               
+                               xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (faction+j3));
+                               xmm7 = _mm_load1_ps(faction+j3+2);
+                               
+                               xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
+                               xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(1,1,1,1));
+                               
+                               xmm5 = _mm_sub_ps(xmm5,t1);
+                               xmm6 = _mm_sub_ps(xmm6,t2);
+                               xmm7 = _mm_sub_ps(xmm7,t3);
+                               
+                               _mm_store_ss(faction+j3 , xmm5);
+                               _mm_store_ss(faction+j3+1,xmm6);
+                               _mm_store_ss(faction+j3+2,xmm7);
+                       }
+                       else if(offset==2)
+                       {
+                               _mm_store_ss(dvda+jnr,dvdaj);
+                               xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1)); 
+                               _mm_store_ss(dvda+jnr2,xmm1);
+                               
+                               xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3));
+                               xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23)); 
+                               
+                               xmm5 = _mm_load1_ps(faction+j3+2); 
+                               xmm6 = _mm_load1_ps(faction+j23+2);
+                               
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0)); 
+                               xmm7 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0)); 
+                               
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2)); 
+                               xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0)); 
+                               xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1)); 
+                               
+                               xmm5 = _mm_sub_ps(xmm5, t1);
+                               xmm6 = _mm_sub_ps(xmm6, t2);
+                               xmm7 = _mm_sub_ps(xmm7, t3);
+                               
+                               xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); 
+                               xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0)); 
+                               
+                               _mm_storel_pi( (__m64 *) (faction+j3), xmm5);
+                               _mm_storeh_pi( (__m64 *) (faction+j23), xmm5);
+                               
+                               _mm_store_ss(faction+j3+2,xmm7);
+                               xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+                               _mm_store_ss(faction+j23+2,xmm7);
+                       }
+                       else
+                       {
+                               _mm_store_ss(dvda+jnr,dvdaj);
+                               xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1)); 
+                               _mm_store_ss(dvda+jnr2,xmm1);
+                               xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(1,0,3,2)); 
+                               _mm_store_ss(dvda+jnr3,xmm1);
+                               
+                               xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3)); 
+                               xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23));  
+                               xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (faction+j33));
+                               
+                               xmm5 = _mm_load1_ps(faction+j3+2);
+                               xmm6 = _mm_load1_ps(faction+j23+2); 
+                               xmm7 = _mm_load1_ps(faction+j33+2); 
+                               
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0)); 
+                               xmm6 = _mm_shuffle_ps(xmm7,xmm7, _MM_SHUFFLE(0,0,0,0)); 
+                               xmm7 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(2,0,2,0)); 
+                               
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2)); 
+                               xmm2 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(3,2,3,2)); 
+                               
+                               xmm5 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0)); 
+                               xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1)); 
+                               
+                               xmm5 = _mm_sub_ps(xmm5, t1);
+                               xmm6 = _mm_sub_ps(xmm6, t2);
+                               xmm7 = _mm_sub_ps(xmm7, t3);
+                               
+                               xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); 
+                               xmm2 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); 
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0)); 
+                               xmm2 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0)); 
+                               
+                               _mm_storel_pi( (__m64 *) (faction+j3), xmm1);
+                               _mm_storeh_pi( (__m64 *) (faction+j23), xmm1);
+                               _mm_storel_pi( (__m64 *) (faction+j33), xmm2);
+                               
+                               _mm_store_ss(faction+j3+2,xmm7); 
+                               xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+                               _mm_store_ss(faction+j23+2,xmm7); 
+                               xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+                               _mm_store_ss(faction+j33+2,xmm7); 
+                       }
+                       
+                       t1 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t1);
+                       t2 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t2);
+                       t3 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t3);
+                       
+                       fix = _mm_add_ps(fix,t1);
+                       fiy = _mm_add_ps(fiy,t2);
+                       fiz = _mm_add_ps(fiz,t3);
                }
                
-        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
-        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+               t1      = _mm_movehl_ps(t1,fix);
+               t2      = _mm_movehl_ps(t2,fiy);
+               t3      = _mm_movehl_ps(t3,fiz);
+               
+               fix     = _mm_add_ps(fix,t1);
+               fiy     = _mm_add_ps(fiy,t2);
+               fiz     = _mm_add_ps(fiz,t3);
+               
+               t1      = _mm_shuffle_ps(fix,fix,_MM_SHUFFLE(1,1,1,1));
+               t2      = _mm_shuffle_ps(fiy,fiy,_MM_SHUFFLE(1,1,1,1));
+               t3      = _mm_shuffle_ps(fiz,fiz,_MM_SHUFFLE(1,1,1,1));
+               
+               fix     = _mm_add_ps(fix,t1);
+               fiy     = _mm_add_ps(fiy,t2);
+               fiz     = _mm_add_ps(fiz,t3);
+               
+               xmm2    = _mm_unpacklo_ps(fix,fiy); /* fx, fy, - - */
+               xmm2    = _mm_movelh_ps(xmm2,fiz); 
+               xmm2    = _mm_and_ps( gmx_mm_castsi128_ps(maski), xmm2);
+               
+               /* load i force from memory */
+               xmm4    = _mm_loadl_pi(xmm4, (__m64 *) (faction+ii3));
+               xmm5    = _mm_load1_ps(faction+ii3+2);
+               xmm4    = _mm_shuffle_ps(xmm4,xmm5,_MM_SHUFFLE(3,2,1,0));
+               
+               /* add to i force */
+               xmm4    = _mm_add_ps(xmm4,xmm2);
+               
+               /* store i force to memory */
+               _mm_storel_pi( (__m64 *) (faction+ii3),xmm4);
+               xmm4    = _mm_shuffle_ps(xmm4,xmm4,_MM_SHUFFLE(2,2,2,2));
+               _mm_store_ss(faction+ii3+2,xmm4);
+               
+    /* Load, add and store i shift forces */
+    xmm4 = _mm_loadl_pi(xmm4, (__m64 *) (fshift+is3));
+    xmm5 = _mm_load1_ps(fshift+is3+2);
+    xmm4 = _mm_shuffle_ps(xmm4,xmm5,_MM_SHUFFLE(3,2,1,0));
+      
+    xmm4 = _mm_add_ps(xmm4,xmm2);
+      
+    _mm_storel_pi( (__m64 *) (fshift+is3),xmm4);
+    xmm4 = _mm_shuffle_ps(xmm4,xmm4,_MM_SHUFFLE(2,2,2,2));
+    _mm_store_ss(fshift+is3+2,xmm4);
+      
+    /* Coulomb potential */
+    ggid             = gid[n];         
+               
+               vcoul   = _mm_movehl_ps(vcoul,vctot);
+               vctot   = _mm_add_ps(vctot,vcoul);
+               vcoul   = _mm_shuffle_ps(vctot,vctot,_MM_SHUFFLE(1,1,1,1));
+               vctot   = _mm_add_ss(vctot,vcoul);
+               
+               _mm_store_ss(&vct,vctot);
+               Vc[ggid] = Vc[ggid] + vct;
+               
+               Vvdwtmp  = _mm_movehl_ps(Vvdwtmp,Vvdwtot);
+               Vvdwtot  = _mm_add_ps(Vvdwtot,Vvdwtmp);
+               Vvdwtmp  = _mm_shuffle_ps(Vvdwtot,Vvdwtot,_MM_SHUFFLE(1,1,1,1));
+               Vvdwtot  = _mm_add_ss(Vvdwtot,Vvdwtmp);
+               
+               _mm_store_ss(&vdwt,Vvdwtot);
+               Vvdw[ggid] = Vvdw[ggid] + vdwt;
+               
+               /* dvda */
+               dvdatmp = _mm_movehl_ps(dvdatmp,dvdasum);
+               dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+               dvdatmp = _mm_shuffle_ps(dvdasum,dvdasum,_MM_SHUFFLE(1,1,1,1));
+               dvdasum = _mm_add_ss(dvdasum,dvdatmp);
+               
+               _mm_store_ss(&dva,dvdasum);
+               dvda[ii] = dvda[ii] + dva*isai_f*isai_f;
+               
+               /* Store the GB potential to the work array */
+               vgb     = _mm_movehl_ps(vgb,vgbtot);
+               vgbtot  = _mm_add_ps(vgbtot,vgb);
+               vgb     = _mm_shuffle_ps(vgbtot,vgbtot,_MM_SHUFFLE(1,1,1,1));
+               vgbtot  = _mm_add_ss(vgbtot,vgb);
+               
+               _mm_store_ss(&vgbt,vgbtot);
+               gpol[ggid] = gpol[ggid] + vgbt;
+    }
+       
+       *outeriter       = nri;            
+    *inneriter       = nj1;            
+}
+
+
+/*
+ * Gromacs nonbonded kernel nb_kernel430nf
+ * Coulomb interaction:     Generalized-Born
+ * VdW interaction:         Tabulated
+ * water optimization:      No
+ * Calculate forces:        no
+ */
+void nb_kernel430nf_x86_64_sse(
+                    int *           p_nri,
+                    int *           iinr,
+                    int *           jindex,
+                    int *           jjnr,
+                    int *           shift,
+                    float *         shiftvec,
+                    float *         fshift,
+                    int *           gid,
+                    float *         pos,
+                    float *         faction,
+                    float *         charge,
+                    float *         p_facel,
+                    float *         p_krf,
+                    float *         p_crf,
+                    float *         Vc,
+                    int *           type,
+                    int *           p_ntype,
+                    float *         vdwparam,
+                    float *         Vvdw,
+                    float *         p_tabscale,
+                    float *         VFtab,
+                    float *         invsqrta,
+                    float *         dvda,
+                    float *         p_gbtabscale,
+                    float *         GBtab,
+                    int *           p_nthreads,
+                    int *           count,
+                    void *          mtx,
+                    int *           outeriter,
+                    int *           inneriter,
+                    float *         work)
+{
+    /* Energy-only ("nf" = no forces) scalar path: Generalized-Born Coulomb
+     * plus tabulated VdW, no water optimization.  Accumulates per-group
+     * potentials into Vc[] and Vvdw[]; faction/fshift are never written. */
+    int           nri,ntype,nthreads;
+    float         facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
+    int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
+    float         shX,shY,shZ;
+    float         iq;
+    float         qq,vcoul,vctot;
+    int           nti;
+    int           tj;
+    float         Vvdw6,Vvdwtot;
+    float         Vvdw12;
+    float         r,rt,eps,eps2;
+    int           n0,nnn;
+    float         Y,F,Geps,Heps2,Fp,VV;
+    float         isai,isaj,isaprod,gbscale;
+    float         ix1,iy1,iz1;
+    float         jx1,jy1,jz1;
+    float         dx11,dy11,dz11,rsq11,rinv11;
+    float         c6,c12;
+
+    nri              = *p_nri;         
+    ntype            = *p_ntype;       
+    nthreads         = *p_nthreads;    
+    facel            = *p_facel;       
+    krf              = *p_krf;         
+    crf              = *p_crf;         
+    tabscale         = *p_tabscale;    
+    gbtabscale       = *p_gbtabscale;  
+    nj1              = 0;              
+    
+    /* Outer loop: one iteration per i particle (neighbor-list entry),
+     * with its periodic shift applied to the i coordinates. */
+    for(n=0; (n<nri); n++)
+    {
+        is3              = 3*shift[n];     
+        shX              = shiftvec[is3];  
+        shY              = shiftvec[is3+1];
+        shZ              = shiftvec[is3+2];
+        nj0              = jindex[n];      
+        nj1              = jindex[n+1];    
+        ii               = iinr[n];        
+        ii3              = 3*ii;           
+        ix1              = shX + pos[ii3+0];
+        iy1              = shY + pos[ii3+1];
+        iz1              = shZ + pos[ii3+2];
+        /* i-particle parameters: pre-scaled charge, inverse Born radius
+         * term (invsqrta), and row offset into the VdW parameter table. */
+        iq               = facel*charge[ii];
+        isai             = invsqrta[ii];   
+        nti              = 2*ntype*type[ii];
+        vctot            = 0;              
+        Vvdwtot          = 0;              
         
-        ggid     = gid[n];         
+        /* Inner loop over j neighbors: accumulate Coulomb and tabulated
+         * VdW energies (dispersion + repulsion) for this i particle. */
+        for(k=nj0; (k<nj1); k++)
+        {
+            jnr              = jjnr[k];        
+            j3               = 3*jnr;          
+            jx1              = pos[j3+0];      
+            jy1              = pos[j3+1];      
+            jz1              = pos[j3+2];      
+            dx11             = ix1 - jx1;      
+            dy11             = iy1 - jy1;      
+            dz11             = iz1 - jz1;      
+            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
+            rinv11           = gmx_invsqrt(rsq11);
+            isaj             = invsqrta[jnr];  
+            isaprod          = isai*isaj;      
+            qq               = iq*charge[jnr]; 
+            vcoul            = qq*rinv11;      
+            /* GB screening: sign-flipped charge product scaled by the
+             * Born-radii product; table index scale likewise per-pair. */
+            qq               = isaprod*(-qq);  
+            gbscale          = isaprod*gbtabscale;
+            tj               = nti+2*type[jnr];
+            c6               = vdwparam[tj];   
+            c12              = vdwparam[tj+1]; 
+            /* GB table lookup: r*gbscale -> integer index n0 (truncation)
+             * plus fractional eps; cubic spline, 4 floats per table point. */
+            r                = rsq11*rinv11;   
+            rt               = r*gbscale;      
+            n0               = rt;             
+            eps              = rt-n0;          
+            eps2             = eps*eps;        
+            nnn              = 4*n0;           
+            Y                = GBtab[nnn];     
+            F                = GBtab[nnn+1];   
+            Geps             = eps*GBtab[nnn+2];
+            Heps2            = eps2*GBtab[nnn+3];
+            Fp               = F+Geps+Heps2;   
+            VV               = Y+eps*Fp;       
+            vgb              = qq*VV;          
+            /* NOTE(review): vgb (GB polarization energy) is computed here
+             * but never accumulated or stored anywhere in this nf kernel --
+             * confirm against the kernel generator whether that is intended. */
+            vctot            = vctot + vcoul;  
+            /* VdW table lookup: 8 floats per point (dispersion quadruplet
+             * first, repulsion quadruplet at nnn+4). */
+            r                = rsq11*rinv11;   
+            rt               = r*tabscale;     
+            n0               = rt;             
+            eps              = rt-n0;          
+            eps2             = eps*eps;        
+            nnn              = 8*n0;           
+            Y                = VFtab[nnn];     
+            F                = VFtab[nnn+1];   
+            Geps             = eps*VFtab[nnn+2];
+            Heps2            = eps2*VFtab[nnn+3];
+            Fp               = F+Geps+Heps2;   
+            VV               = Y+eps*Fp;       
+            Vvdw6            = c6*VV;          
+            nnn              = nnn+4;          
+            Y                = VFtab[nnn];     
+            F                = VFtab[nnn+1];   
+            Geps             = eps*VFtab[nnn+2];
+            Heps2            = eps2*VFtab[nnn+3];
+            Fp               = F+Geps+Heps2;   
+            VV               = Y+eps*Fp;       
+            Vvdw12           = c12*VV;         
+            Vvdwtot          = Vvdwtot+ Vvdw6 + Vvdw12;
+        }
         
-        gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
-        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
-       }
+        /* Add this i particle's energies to its energy group's totals. */
+        ggid             = gid[n];         
+        Vc[ggid]         = Vc[ggid] + vctot;
+        Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
+    }
    
-       *outeriter   = nri;            
-    *inneriter   = nj1;        
+    *outeriter       = nri;            
+    *inneriter       = nj1;            
 }
 
+
index 3e00f4d6710b86aea0227cdd243df08894a841d5..ec24e41fc0c8c41bd97230570f0285649b1b5cca 100644 (file)
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
 
+#include "nb_kernel400_ia32_sse2.h"
+
 
-       
 void nb_kernel400_ia32_sse2(int *           p_nri,
-                    int *           iinr,
-                    int *           jindex,
-                    int *           jjnr,
-                    int *           shift,
-                    double *         shiftvec,
-                    double *         fshift,
-                    int *           gid,
-                    double *         pos,
-                    double *         faction,
-                    double *         charge,
-                    double *         p_facel,
-                    double *         p_krf,
-                    double *         p_crf,
-                    double *         Vc,
-                    int *           type,
-                    int *           p_ntype,
-                    double *         vdwparam,
-                    double *         Vvdw,
-                    double *         p_tabscale,
-                    double *         VFtab,
-                    double *         invsqrta,
-                    double *         dvda,
-                    double *         p_gbtabscale,
-                    double *         GBtab,
-                    int *           p_nthreads,
-                    int *           count,
-                    void *          mtx,
-                    int *           outeriter,
-                    int *           inneriter,
-                    double *         work)
+                              int *           iinr,
+                              int *           jindex,
+                              int *           jjnr,
+                              int *           shift,
+                              double *         shiftvec,
+                              double *         fshift,
+                              int *           gid,
+                              double *         pos,
+                              double *         faction,
+                              double *         charge,
+                              double *         p_facel,
+                              double *         p_krf,
+                              double *         p_crf,
+                              double *         vc,
+                              int *           type,
+                              int *           p_ntype,
+                              double *         vdwparam,
+                              double *         vvdw,
+                              double *         p_tabscale,
+                              double *         VFtab,
+                              double *         invsqrta,
+                              double *         dvda,
+                              double *         p_gbtabscale,
+                              double *         GBtab,
+                              int *           p_nthreads,
+                              int *           count,
+                              void *          mtx,
+                              int *           outeriter,
+                              int *           inneriter,
+                              double *         work)
 {
-       int           nri,ntype,nthreads,offset;
-       int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
-       double        facel,krf,crf,tabscl,gbtabscl,vct,vgbt;
-       double        shX,shY,shZ,isai_d,dva;
+    int           nri,nthreads;
+    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+    double        shX,shY,shZ;
+    int           jnrA,jnrB;
+    int           j3A,j3B;
        gmx_gbdata_t *gbdata;
-       double *        gpol;
+       double *      gpol;
+    
+       __m128d  iq,qq,jq,isai;
+       __m128d  ix,iy,iz;
+       __m128d  jx,jy,jz;
+       __m128d  dx,dy,dz;
+       __m128d  vctot,vgbtot,dvdasum,gbfactor;
+       __m128d  fix,fiy,fiz,tx,ty,tz,rsq;
+       __m128d  rinv,isaj,isaprod;
+       __m128d  vcoul,fscal,gbscale;
+       __m128d  rinvsq,r,rtab;
+       __m128d  eps,Y,F,G,H;
+       __m128d  vgb,fijGB,dvdatmp;
+       __m128d  facel,gbtabscale,dvdaj;
+       __m128i  n0, nnn;
        
-       __m128d       ix,iy,iz,jx,jy,jz;
-       __m128d           dx,dy,dz,t1,t2,t3;
-       __m128d           fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
-       __m128d           q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
-       __m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
-       __m128d           xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
-       __m128d       fac,tabscale,gbtabscale,gbfactor;
-       __m128i       n0,nnn;
-       
-       const __m128d neg    = {-1.0f,-1.0f};
-       const __m128d zero   = {0.0f,0.0f};
-       const __m128d half   = {0.5f,0.5f};
-       const __m128d two    = {2.0f,2.0f};
-       const __m128d three  = {3.0f,3.0f};
+       const __m128d neg        = _mm_set1_pd(-1.0);
+       const __m128d zero       = _mm_set1_pd(0.0);
+       const __m128d minushalf  = _mm_set1_pd(-0.5);
+       const __m128d two        = _mm_set1_pd(2.0);
        
        gbdata     = (gmx_gbdata_t *)work;
        gpol       = gbdata->gpol;
-
+    
        nri        = *p_nri;
-       ntype      = *p_ntype;
-       nthreads   = *p_nthreads; 
-  facel      = *p_facel;    
-       krf        = *p_krf;
-       crf        = *p_crf;
-       tabscl     = *p_tabscale;
-       gbtabscl   = *p_gbtabscale;
-       nj1        = 0;
-       
-       /* Splat variables */
-       fac        = _mm_load1_pd(&facel);
-       tabscale   = _mm_load1_pd(&tabscl);
-       gbtabscale = _mm_load1_pd(&gbtabscl);
-  gbfactor   = _mm_set1_pd(((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
-               
-       /* Keep compiler happy */
-       dvdatmp = _mm_setzero_pd();
-       vgb     = _mm_setzero_pd();
-       dvdaj   = _mm_setzero_pd();
-       isaj    = _mm_setzero_pd();
-       vcoul   = _mm_setzero_pd();
-       t1      = _mm_setzero_pd();
-       t2      = _mm_setzero_pd();
-       t3      = _mm_setzero_pd();
-       
-       jnr1=jnr2=0;
-       j13=j23=0;
+    
+    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+    gbtabscale = _mm_load1_pd(p_gbtabscale);  
+    facel      = _mm_load1_pd(p_facel);
+    
+    nj1         = 0;
+    jnrA = jnrB = 0;
+    j3A = j3B   = 0;
+    jx          = _mm_setzero_pd();
+    jy          = _mm_setzero_pd();
+    jz          = _mm_setzero_pd();
        
        for(n=0;n<nri;n++)
        {
-               is3     = 3*shift[n];
-               shX     = shiftvec[is3];
-               shY     = shiftvec[is3+1];
-               shZ     = shiftvec[is3+2];
+        is3              = 3*shift[n];     
+        shX              = shiftvec[is3];  
+        shY              = shiftvec[is3+1];
+        shZ              = shiftvec[is3+2];
+        nj0              = jindex[n];      
+        nj1              = jindex[n+1];    
+        ii               = iinr[n];        
+        ii3              = 3*ii;           
                
-               nj0     = jindex[n];      
-        nj1     = jindex[n+1];  
-               offset  = (nj1-nj0)%2;
-               
-               ii      = iinr[n];
-               ii3     = ii*3;
-               
-               ix      = _mm_set1_pd(shX+pos[ii3+0]);
-               iy      = _mm_set1_pd(shX+pos[ii3+1]);
-               iz      = _mm_set1_pd(shX+pos[ii3+2]); 
-               q       = _mm_set1_pd(charge[ii]);
-               
-               iq      = _mm_mul_pd(fac,q); 
-               isai_d  = invsqrta[ii];
-               isai    = _mm_load1_pd(&isai_d);
-               
-               fix     = _mm_setzero_pd();
-               fiy     = _mm_setzero_pd();
-               fiz     = _mm_setzero_pd();
-               dvdasum = _mm_setzero_pd();
-               vctot   = _mm_setzero_pd();
-               vgbtot  = _mm_setzero_pd();
-               
-               for(k=nj0;k<nj1-offset; k+=2)
+               ix               = _mm_set1_pd(shX+pos[ii3+0]);
+               iy               = _mm_set1_pd(shY+pos[ii3+1]);
+               iz               = _mm_set1_pd(shZ+pos[ii3+2]);
+        
+               iq               = _mm_load1_pd(charge+ii);
+               iq               = _mm_mul_pd(iq,facel);
+        
+               isai             = _mm_load1_pd(invsqrta+ii);
+                       
+               vctot            = _mm_setzero_pd();
+               vgbtot           = _mm_setzero_pd();
+               dvdasum          = _mm_setzero_pd();
+               fix              = _mm_setzero_pd();
+               fiy              = _mm_setzero_pd();
+               fiz              = _mm_setzero_pd();
+                
+               for(k=nj0;k<nj1-1; k+=2)
                {
-                       jnr1    = jjnr[k];
-                       jnr2    = jjnr[k+1];
-                                               
-                       j13     = jnr1 * 3;
-                       j23     = jnr2 * 3;
-                       
-                       /* Load coordinates */
-                       xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
-                       xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */
-                       
-                       xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
-                       xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */
-                       
-                       /* transpose */
-                       jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); 
-                       
-                       /* distances */
-                       dx      = _mm_sub_pd(ix,jx);
-                       dy              = _mm_sub_pd(iy,jy);
-                       dz              = _mm_sub_pd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
-                                               
-                       /* Load invsqrta */
-                       isaj    = _mm_loadl_pd(isaj,invsqrta+jnr1);
-                       isaj    = _mm_loadh_pd(isaj,invsqrta+jnr2);
-                       isaprod = _mm_mul_pd(isai,isaj);
-                       
-                       /* Load charges */
-                       q               = _mm_loadl_pd(q,charge+jnr1);
-                       q               = _mm_loadh_pd(q,charge+jnr2);
-                       qq              = _mm_mul_pd(iq,q);
-                       
-                       vcoul   = _mm_mul_pd(qq,rinv);
-                       fscal   = _mm_mul_pd(vcoul,rinv);
-                       qq              = _mm_mul_pd(isaprod,qq);
-                       qq              = _mm_mul_pd(qq,neg);
-      qq      = _mm_mul_pd(qq,gbfactor);
-                       gbscale = _mm_mul_pd(isaprod,gbtabscale);
-                       
-                       /* Load dvdaj */
-                       dvdaj   = _mm_loadl_pd(dvdaj, dvda+jnr1);
-                       dvdaj   = _mm_loadh_pd(dvdaj, dvda+jnr2);
-                       
-                       r               = _mm_mul_pd(rsq11,rinv);
-                       rt              = _mm_mul_pd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_pd(rt,n0d);
-                       eps2    = _mm_mul_pd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G               = _mm_mul_pd(G,eps);
-                       H               = _mm_mul_pd(H,eps2);
-                       Fp              = _mm_add_pd(F,G);
-                       Fp              = _mm_add_pd(Fp,H);
-                       VV              = _mm_mul_pd(Fp,eps);
-                       VV              = _mm_add_pd(Y,VV);
-                       H               = _mm_mul_pd(two,H);
-                       FF              = _mm_add_pd(Fp,G);
-                       FF              = _mm_add_pd(FF,H);
-                       vgb             = _mm_mul_pd(qq,VV);
-                       fijC    = _mm_mul_pd(qq,FF);
-                       fijC    = _mm_mul_pd(fijC,gbscale);
+                       jnrA    = jjnr[k];
+                       jnrB    = jjnr[k+1];
                        
-                       dvdatmp = _mm_mul_pd(fijC,r);
-                       dvdatmp = _mm_add_pd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_pd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_pd(dvdatmp,half);
-                       dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_pd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_pd(xmm1,isaj);
-                       dvdaj   = _mm_add_pd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       _mm_storeh_pd(dvda+jnr2,dvdaj);
-                       
-                       vctot   = _mm_add_pd(vctot,vcoul);
-                       vgbtot  = _mm_add_pd(vgbtot,vgb);
-                                       
-                       fscal   = _mm_sub_pd(fijC,fscal);
-                       fscal   = _mm_mul_pd(fscal,neg);
-                       fscal   = _mm_mul_pd(fscal,rinv);
-                                               
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_pd(fscal,dx);
-                       t2              = _mm_mul_pd(fscal,dy);
-                       t3              = _mm_mul_pd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_pd(fix,t1);
-                       fiy             = _mm_add_pd(fiy,t2);
-                       fiz             = _mm_add_pd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm1    = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
-                       xmm2    = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-                       
-                       xmm5    = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
-                       xmm6    = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-                       
-                       /* transpose */
-                       xmm7    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
-                       xmm5    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
-                       xmm6    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_pd(xmm5,t1);
-                       xmm6    = _mm_sub_pd(xmm6,t2);
-                       xmm7    = _mm_sub_pd(xmm7,t3);
-                       
-                       xmm1    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
-                       xmm2    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* store fx and fy */
-                       _mm_storeu_pd(faction+j13,xmm1);
-                       _mm_storeu_pd(faction+j23,xmm2);
-                       
-                       /* .. then fz */
-                       _mm_storel_pd(faction+j13+2,xmm7);
-                       _mm_storeh_pd(faction+j23+2,xmm7);
-               }
+                       j3A     = jnrA * 3;
+                       j3B     = jnrB * 3;
+            
+            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+            
+                       dx           = _mm_sub_pd(ix,jx);
+                       dy           = _mm_sub_pd(iy,jy);
+                       dz           = _mm_sub_pd(iz,jz);
+            
+            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+            
+            rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_pd(rinv,rinv);
+            
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+                       GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+            
+                       isaprod      = _mm_mul_pd(isai,isaj);
+                       qq           = _mm_mul_pd(iq,jq);            
+                       vcoul        = _mm_mul_pd(qq,rinv);
+                       fscal        = _mm_mul_pd(vcoul,rinv);                                 
+            vctot        = _mm_add_pd(vctot,vcoul);
+            
+            /* Polarization interaction */
+                       qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_pd(isaprod,gbtabscale);
+            
+                       /* Calculate GB table index */
+                       r            = _mm_mul_pd(rsq,rinv);
+                       rtab         = _mm_mul_pd(r,gbscale);
+                       
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
+            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+            GMX_MM_TRANSPOSE2_PD(Y,F);
+            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+            GMX_MM_TRANSPOSE2_PD(G,H);
+            
+            G       = _mm_mul_pd(G,eps);
+            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+            vgb     = _mm_mul_pd(Y, qq);           
+            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+            
+            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
 
+            vgbtot  = _mm_add_pd(vgbtot, vgb);
+            
+            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+            
+            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+                                   
+            fscal        = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
+            
+            /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+            
+            /* Calculate temporary vectorial force */
+            tx           = _mm_mul_pd(fscal,dx);
+            ty           = _mm_mul_pd(fscal,dy);
+            tz           = _mm_mul_pd(fscal,dz);
+            
+            /* Increment i atom force */
+            fix          = _mm_add_pd(fix,tx);
+            fiy          = _mm_add_pd(fiy,ty);
+            fiz          = _mm_add_pd(fiz,tz);
+            
+            /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
+               }
+               
                /* In double precision, offset can only be either 0 or 1 */
-               if(offset!=0)
+               if(k<nj1)
                {
-                       jnr1    = jjnr[k];
-                       j13             = jnr1*3;
-                       
-                       jx      = _mm_load_sd(pos+j13);
-                       jy      = _mm_load_sd(pos+j13+1);
-                       jz      = _mm_load_sd(pos+j13+2);
-                                               
-                       isaj    = _mm_load_sd(invsqrta+jnr1);
-                       isaprod = _mm_mul_sd(isai,isaj);
-                       dvdaj   = _mm_load_sd(dvda+jnr1);
-                       q               = _mm_load_sd(charge+jnr1);
-                       qq      = _mm_mul_sd(iq,q);
-                       
-                       dx      = _mm_sub_sd(ix,jx);
-                       dy              = _mm_sub_sd(iy,jy);
-                       dz              = _mm_sub_sd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
+                       jnrA    = jjnr[k];
+                       j3A     = jnrA * 3;
             
-                       vcoul   = _mm_mul_sd(qq,rinv);
-                       fscal   = _mm_mul_sd(vcoul,rinv);
-                       qq              = _mm_mul_sd(isaprod,qq);
-                       qq              = _mm_mul_sd(qq,neg);
-      qq      = _mm_mul_sd(qq,gbfactor);
-                       gbscale = _mm_mul_sd(isaprod,gbtabscale);
-                       
-                       r               = _mm_mul_sd(rsq11,rinv);
-                       rt              = _mm_mul_sd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_sd(rt,n0d);
-                       eps2    = _mm_mul_sd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); 
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); 
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); 
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); 
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); 
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); 
-                       
-                       G               = _mm_mul_sd(G,eps);
-                       H               = _mm_mul_sd(H,eps2);
-                       Fp              = _mm_add_sd(F,G);
-                       Fp              = _mm_add_sd(Fp,H);
-                       VV              = _mm_mul_sd(Fp,eps);
-                       VV              = _mm_add_sd(Y,VV);
-                       H               = _mm_mul_sd(two,H);
-                       FF              = _mm_add_sd(Fp,G);
-                       FF              = _mm_add_sd(FF,H);
-                       vgb             = _mm_mul_sd(qq,VV);
-                       fijC    = _mm_mul_sd(qq,FF);
-                       fijC    = _mm_mul_sd(fijC,gbscale);
-                       
-                       dvdatmp = _mm_mul_sd(fijC,r);
-                       dvdatmp = _mm_add_sd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_sd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_sd(dvdatmp,half);
-                       dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_sd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_sd(xmm1,isaj);
-                       dvdaj   = _mm_add_sd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       
-                       vctot   = _mm_add_sd(vctot,vcoul);
-                       vgbtot  = _mm_add_sd(vgbtot,vgb);
-                                               
-                       fscal   = _mm_sub_sd(fijC,fscal);
-                       fscal   = _mm_mul_sd(fscal,neg);
-                       fscal   = _mm_mul_sd(fscal,rinv);
-                                                               
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_sd(fscal,dx);
-                       t2              = _mm_mul_sd(fscal,dy);
-                       t3              = _mm_mul_sd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_sd(fix,t1);
-                       fiy             = _mm_add_sd(fiy,t2);
-                       fiz             = _mm_add_sd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm5    = _mm_load_sd(faction+j13);   /* fx */
-                       xmm6    = _mm_load_sd(faction+j13+1); /* fy */
-                       xmm7    = _mm_load_sd(faction+j13+2); /* fz */
-                                               
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_sd(xmm5,t1);
-                       xmm6    = _mm_sub_sd(xmm6,t2);
-                       xmm7    = _mm_sub_sd(xmm7,t3);
+            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+            
+                       dx           = _mm_sub_sd(ix,jx);
+                       dy           = _mm_sub_sd(iy,jy);
+                       dz           = _mm_sub_sd(iz,jz);
+            
+            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+            
+            rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_sd(rinv,rinv);
+            
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+                       GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+                               
+                       isaprod      = _mm_mul_sd(isai,isaj);
+                       qq           = _mm_mul_sd(iq,jq);            
+                       vcoul        = _mm_mul_sd(qq,rinv);
+                       fscal        = _mm_mul_sd(vcoul,rinv);                                 
+            vctot        = _mm_add_sd(vctot,vcoul);
+            
+            /* Polarization interaction */
+                       qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_sd(isaprod,gbtabscale);
+            
+                       /* Calculate GB table index */
+                       r            = _mm_mul_sd(rsq,rinv);
+                       rtab         = _mm_mul_sd(r,gbscale);
+
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
+            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+            F            = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(Y,F);
+            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+            H            = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(G,H);
+            
+            G       = _mm_mul_sd(G,eps);
+            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+            vgb     = _mm_mul_sd(Y, qq);           
+            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+            
+            vgbtot  = _mm_add_sd(vgbtot, vgb);
+            
+            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
+            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+            
+            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
                        
-                       /* store forces */
-                       _mm_store_sd(faction+j13,xmm5);
-                       _mm_store_sd(faction+j13+1,xmm6);
-                       _mm_store_sd(faction+j13+2,xmm7);
+            fscal        = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
+            
+            /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+            
+            /* Calculate temporary vectorial force */
+            tx           = _mm_mul_sd(fscal,dx);
+            ty           = _mm_mul_sd(fscal,dy);
+            tz           = _mm_mul_sd(fscal,dz);
+            
+            /* Increment i atom force */
+            fix          = _mm_add_sd(fix,tx);
+            fiy          = _mm_add_sd(fiy,ty);
+            fiz          = _mm_add_sd(fiz,tz);
+            
+            /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-               /* fix/fiy/fiz now contain four partial terms, that all should be
-                * added to the i particle forces
-                */
-               t1               = _mm_unpacklo_pd(t1,fix);
-               t2               = _mm_unpacklo_pd(t2,fiy);
-               t3               = _mm_unpacklo_pd(t3,fiz);
-                               
-               fix              = _mm_add_pd(fix,t1);
-               fiy              = _mm_add_pd(fiy,t2);
-               fiz              = _mm_add_pd(fiz,t3);
-               
-               fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
-               fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
-               fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-               
-               /* Load i forces from memory */
-               xmm1     = _mm_load_sd(faction+ii3);
-               xmm2     = _mm_load_sd(faction+ii3+1);
-               xmm3     = _mm_load_sd(faction+ii3+2);
-               
-               /* Add to i force */
-               fix      = _mm_add_sd(fix,xmm1);
-               fiy      = _mm_add_sd(fiy,xmm2);
-               fiz      = _mm_add_sd(fiz,xmm3);
-       
-               /* store i forces to memory */
-               _mm_store_sd(faction+ii3,fix);
-               _mm_store_sd(faction+ii3+1,fiy);
-               _mm_store_sd(faction+ii3+2,fiz);
-               
-               /* Load i shift forces from memory */
-               xmm1     = _mm_load_sd(fshift+is3);
-               xmm2     = _mm_load_sd(fshift+is3+1);
-               xmm3     = _mm_load_sd(fshift+is3+2);
-               
-               /* Add to i force */
-               fix      = _mm_add_sd(fix,xmm1);
-               fiy      = _mm_add_sd(fiy,xmm2);
-               fiz      = _mm_add_sd(fiz,xmm3);
-               
-               /* store i shift forces to memory */
-               _mm_store_sd(fshift+is3,fix);
-               _mm_store_sd(fshift+is3+1,fiy);
-               _mm_store_sd(fshift+is3+2,fiz);
-                               
-               /* now do dvda */
-               dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
-               dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
-               _mm_storeh_pd(&dva,dvdasum);
-               dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-               
-               ggid     = gid[n];
-               
-               /* Coulomb potential */
-               vcoul    = _mm_unpacklo_pd(vcoul,vctot);
-               vctot    = _mm_add_pd(vctot,vcoul);
-               _mm_storeh_pd(&vct,vctot);
-               Vc[ggid] = Vc[ggid] + vct;
-               
-               /* GB potential */
-               vgb      = _mm_unpacklo_pd(vgb,vgbtot);
-               vgbtot   = _mm_add_pd(vgbtot,vgb);
-               _mm_storeh_pd(&vgbt,vgbtot);
-               gpol[ggid] = gpol[ggid] + vgbt;
+        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+        
+        ggid     = gid[n];         
+        
+        gmx_mm_update_1pot_pd(vctot,vc+ggid);
+        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
        }
-       
+    
        *outeriter   = nri;            
-    *inneriter   = nj1; 
-       
+    *inneriter   = nj1;        
 }
index 56b7dfe8fd6809d6ba99843325572e4a68891f60..759916a8e620d9905766fe3913b5d11baf484a6d 100644 (file)
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
 
+#include "nb_kernel410_ia32_sse2.h"
+
+
+
 void nb_kernel410_ia32_sse2(int *           p_nri,
                                                        int *           iinr,
                                                        int *           jindex,
@@ -40,11 +44,11 @@ void nb_kernel410_ia32_sse2(int *           p_nri,
                                                        double *         p_facel,
                                                        double *         p_krf,
                                                        double *         p_crf,
-                                                       double *         Vc,
+                                                       double *         vc,
                                                        int *           type,
                                                        int *           p_ntype,
                                                        double *         vdwparam,
-                                                       double *         Vvdw,
+                                                       double *         vvdw,
                                                        double *         p_tabscale,
                                                        double *         VFtab,
                                                        double *         invsqrta,
@@ -58,478 +62,302 @@ void nb_kernel410_ia32_sse2(int *           p_nri,
                                                        int *           inneriter,
                                                        double *         work)
 {
-       int           nri,ntype,nthreads,offset,tj,tj2,nti;
-       int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
-       double        facel,krf,crf,tabscl,gbtabscl,vct,vdwt,nt1,nt2;
-       double        shX,shY,shZ,isai_d,dva,vgbt;
+    int           nri,ntype,nthreads;
+    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+    double        shX,shY,shZ;
+       int                       offset,nti;
+    int           jnrA,jnrB;
+    int           j3A,j3B;
+       int           tjA,tjB;
        gmx_gbdata_t *gbdata;
-       double *        gpol;
-
-       __m128d       ix,iy,iz,jx,jy,jz;
-       __m128d           dx,dy,dz,t1,t2,t3;
-       __m128d           fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
-       __m128d           q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
-       __m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
-       __m128d           xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
-       __m128d       c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,rinvsq,rinvsix;
-       __m128d       fac,tabscale,gbtabscale,gbfactor;
-       __m128i       n0,nnn;
+       double *      gpol;
+    
+       __m128d  iq,qq,jq,isai;
+       __m128d  ix,iy,iz;
+       __m128d  jx,jy,jz;
+       __m128d  dx,dy,dz;
+       __m128d  vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
+       __m128d  fix,fiy,fiz,tx,ty,tz,rsq;
+       __m128d  rinv,isaj,isaprod;
+       __m128d  vcoul,fscal,gbscale,c6,c12;
+       __m128d  rinvsq,r,rtab;
+       __m128d  eps,Y,F,G,H;
+       __m128d  vgb,fijGB,dvdatmp;
+       __m128d  rinvsix,vvdw6,vvdw12;
+       __m128d  facel,gbtabscale,dvdaj;
+       __m128i  n0, nnn;
        
-       const __m128d neg    = {-1.0f,-1.0f};
-       const __m128d zero   = {0.0f,0.0f};
-       const __m128d half   = {0.5f,0.5f};
-       const __m128d two    = {2.0f,2.0f};
-       const __m128d three  = {3.0f,3.0f};
-       const __m128d six    = {6.0f,6.0f};
-       const __m128d twelwe = {12.0f,12.0f};
+       const __m128d neg        = _mm_set1_pd(-1.0);
+       const __m128d zero       = _mm_set1_pd(0.0);
+       const __m128d minushalf  = _mm_set1_pd(-0.5);
+       const __m128d two        = _mm_set1_pd(2.0);
+       const __m128d six        = _mm_set1_pd(6.0);
+       const __m128d twelve     = _mm_set1_pd(12.0);
        
        gbdata     = (gmx_gbdata_t *)work;
        gpol       = gbdata->gpol;
 
        nri        = *p_nri;
        ntype      = *p_ntype;
-       nthreads   = *p_nthreads; 
-  facel      = *p_facel;     
-       krf        = *p_krf;
-       crf        = *p_crf;
-       tabscl     = *p_tabscale;
-       gbtabscl   = *p_gbtabscale;
-       nj1        = 0;
-       
-       /* Splat variables */
-       fac        = _mm_load1_pd(&facel);
-       tabscale   = _mm_load1_pd(&tabscl);
-       gbtabscale = _mm_load1_pd(&gbtabscl);
-  gbfactor   = _mm_set1_pd(((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
-               
-       /* Keep compiler happy */
-       Vvdwtmp = _mm_setzero_pd();
-       Vvdwtot = _mm_setzero_pd();
-       dvdatmp = _mm_setzero_pd();
-       dvdaj   = _mm_setzero_pd();
-       isaj    = _mm_setzero_pd();
-       vcoul   = _mm_setzero_pd();
-       vgb     = _mm_setzero_pd();
-       t1      = _mm_setzero_pd();
-       t2      = _mm_setzero_pd();
-       t3      = _mm_setzero_pd();
-       xmm1    = _mm_setzero_pd();
-       xmm2    = _mm_setzero_pd();
-       xmm3    = _mm_setzero_pd();
-       xmm4    = _mm_setzero_pd();
-       jnr1    = jnr2 = 0;
-       j13     = j23  = 0;
+    
+    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+    gbtabscale = _mm_load1_pd(p_gbtabscale);  
+    facel      = _mm_load1_pd(p_facel);
+
+    nj1         = 0;
+    jnrA = jnrB = 0;
+    j3A = j3B   = 0;
+    jx          = _mm_setzero_pd();
+    jy          = _mm_setzero_pd();
+    jz          = _mm_setzero_pd();
+    c6          = _mm_setzero_pd();
+    c12         = _mm_setzero_pd();
        
        for(n=0;n<nri;n++)
        {
-               is3     = 3*shift[n];
-               shX     = shiftvec[is3];
-               shY     = shiftvec[is3+1];
-               shZ     = shiftvec[is3+2];
-               
-               nj0     = jindex[n];      
-        nj1     = jindex[n+1];  
-               offset  = (nj1-nj0)%2;
-               
-               ii      = iinr[n];
-               ii3     = ii*3;
-               
-               ix      = _mm_set1_pd(shX+pos[ii3+0]);
-               iy      = _mm_set1_pd(shX+pos[ii3+1]);
-               iz      = _mm_set1_pd(shX+pos[ii3+2]); 
-               q       = _mm_set1_pd(charge[ii]);
-               
-               iq      = _mm_mul_pd(fac,q); 
-               isai_d  = invsqrta[ii];
-               isai    = _mm_load1_pd(&isai_d);
-               
-               nti      = 2*ntype*type[ii];
+        is3              = 3*shift[n];     
+        shX              = shiftvec[is3];  
+        shY              = shiftvec[is3+1];
+        shZ              = shiftvec[is3+2];
+        nj0              = jindex[n];      
+        nj1              = jindex[n+1];    
+        ii               = iinr[n];        
+        ii3              = 3*ii;           
                
-               fix     = _mm_setzero_pd();
-               fiy     = _mm_setzero_pd();
-               fiz     = _mm_setzero_pd();
-               dvdasum = _mm_setzero_pd();
-               vctot   = _mm_setzero_pd();
-               vgbtot  = _mm_setzero_pd();
-               Vvdwtot = _mm_setzero_pd();
+               ix               = _mm_set1_pd(shX+pos[ii3+0]);
+               iy               = _mm_set1_pd(shY+pos[ii3+1]);
+               iz               = _mm_set1_pd(shZ+pos[ii3+2]);
+        
+               iq               = _mm_load1_pd(charge+ii);
+               iq               = _mm_mul_pd(iq,facel);
+        
+               isai             = _mm_load1_pd(invsqrta+ii);
+        
+               nti              = 2*ntype*type[ii];
                
-               for(k=nj0;k<nj1-offset; k+=2)
+               vctot            = _mm_setzero_pd();
+               vvdwtot          = _mm_setzero_pd();
+               vgbtot           = _mm_setzero_pd();
+               dvdasum          = _mm_setzero_pd();
+               fix              = _mm_setzero_pd();
+               fiy              = _mm_setzero_pd();
+               fiz              = _mm_setzero_pd();
+        
+               for(k=nj0;k<nj1-1; k+=2)
                {
-                       jnr1    = jjnr[k];
-                       jnr2    = jjnr[k+1];
-                       
-                       j13     = jnr1 * 3;
-                       j23     = jnr2 * 3;
-                       
-                       /* Load coordinates */
-                       xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
-                       xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */
-                       
-                       xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
-                       xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */
-                       
-                       /* transpose */
-                       jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); 
-                       
-                       /* distances */
-                       dx      = _mm_sub_pd(ix,jx);
-                       dy              = _mm_sub_pd(iy,jy);
-                       dz              = _mm_sub_pd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
-                       
-                       /* Load invsqrta */
-                       isaj    = _mm_loadl_pd(isaj,invsqrta+jnr1);
-                       isaj    = _mm_loadh_pd(isaj,invsqrta+jnr2);
-                       isaprod = _mm_mul_pd(isai,isaj);
-                       
-                       /* Load charges */
-                       q               = _mm_loadl_pd(q,charge+jnr1);
-                       q               = _mm_loadh_pd(q,charge+jnr2);
-                       qq              = _mm_mul_pd(iq,q);
-                       
-                       vcoul   = _mm_mul_pd(qq,rinv);
-                       fscal   = _mm_mul_pd(vcoul,rinv);
-                       qq              = _mm_mul_pd(isaprod,qq);
-      qq      = _mm_mul_pd(qq,gbfactor);
-                       qq              = _mm_mul_pd(qq,neg);
-                       gbscale = _mm_mul_pd(isaprod,gbtabscale);
-                       
-                       /* Load VdW parameters */
-                       tj      = nti+2*type[jnr1];
-                       tj2     = nti+2*type[jnr2];
-                       
-                       xmm1      = _mm_loadu_pd(vdwparam+tj);
-                       xmm2     = _mm_loadu_pd(vdwparam+tj2);
-                       c6      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
-                       c12     = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
-                                               
-                       rinvsq  = _mm_mul_pd(rinv,rinv);
+                       jnrA    = jjnr[k];
+                       jnrB    = jjnr[k+1];
                        
-                       /* Load dvdaj */
-                       dvdaj   = _mm_loadl_pd(dvdaj, dvda+jnr1);
-                       dvdaj   = _mm_loadh_pd(dvdaj, dvda+jnr2);
-                       
-                       r               = _mm_mul_pd(rsq11,rinv);
-                       rt              = _mm_mul_pd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_pd(rt,n0d);
-                       eps2    = _mm_mul_pd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G               = _mm_mul_pd(G,eps);
-                       H               = _mm_mul_pd(H,eps2);
-                       Fp              = _mm_add_pd(F,G);
-                       Fp              = _mm_add_pd(Fp,H);
-                       VV              = _mm_mul_pd(Fp,eps);
-                       VV              = _mm_add_pd(Y,VV);
-                       H               = _mm_mul_pd(two,H);
-                       FF              = _mm_add_pd(Fp,G);
-                       FF              = _mm_add_pd(FF,H);
-                       vgb             = _mm_mul_pd(qq,VV);
-                       fijC    = _mm_mul_pd(qq,FF);
-                       fijC    = _mm_mul_pd(fijC,gbscale);
-                       
-                       dvdatmp = _mm_mul_pd(fijC,r);
-                       dvdatmp = _mm_add_pd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_pd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_pd(dvdatmp,half);
-                       dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_pd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_pd(xmm1,isaj);
-                       dvdaj   = _mm_add_pd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       _mm_storeh_pd(dvda+jnr2,dvdaj);
-                       
-                       vctot   = _mm_add_pd(vctot,vcoul);
-                       vgbtot  = _mm_add_pd(vgbtot,vgb);
-                       
-                       /* VdW interaction */
-                       rinvsix = _mm_mul_pd(rinvsq,rinvsq);
-                       rinvsix = _mm_mul_pd(rinvsix,rinvsq);
-                       
-                       Vvdw6   = _mm_mul_pd(c6,rinvsix);
-                       Vvdw12  = _mm_mul_pd(c12,rinvsix);
-                       Vvdw12  = _mm_mul_pd(Vvdw12,rinvsix);
-                       Vvdwtmp = _mm_sub_pd(Vvdw12,Vvdw6);
-                       Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
-                       
-                       xmm1    = _mm_mul_pd(twelwe,Vvdw12);
-                       xmm2    = _mm_mul_pd(six,Vvdw6);
-                       xmm1    = _mm_sub_pd(xmm1,xmm2);
-                       xmm1    = _mm_mul_pd(xmm1,rinvsq);
-                       
-                       /* Scalar force */
-                       fscal   = _mm_sub_pd(fijC,fscal);
-                       fscal   = _mm_mul_pd(fscal,rinv);
-                       fscal   = _mm_sub_pd(xmm1,fscal);
-                       
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_pd(fscal,dx);
-                       t2              = _mm_mul_pd(fscal,dy);
-                       t3              = _mm_mul_pd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_pd(fix,t1);
-                       fiy             = _mm_add_pd(fiy,t2);
-                       fiz             = _mm_add_pd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm1    = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
-                       xmm2    = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-                       
-                       xmm5    = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
-                       xmm6    = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-                       
-                       /* transpose */
-                       xmm7    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
-                       xmm5    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
-                       xmm6    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_pd(xmm5,t1);
-                       xmm6    = _mm_sub_pd(xmm6,t2);
-                       xmm7    = _mm_sub_pd(xmm7,t3);
-                       
-                       xmm1    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
-                       xmm2    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* store fx and fy */
-                       _mm_storeu_pd(faction+j13,xmm1);
-                       _mm_storeu_pd(faction+j23,xmm2);
-                       
-                       /* .. then fz */
-                       _mm_storel_pd(faction+j13+2,xmm7);
-                       _mm_storeh_pd(faction+j23+2,xmm7);
+                       j3A     = jnrA * 3;
+                       j3B     = jnrB * 3;
+
+            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+
+                       dx           = _mm_sub_pd(ix,jx);
+                       dy           = _mm_sub_pd(iy,jy);
+                       dz           = _mm_sub_pd(iz,jz);
+
+            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+            rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_pd(rinv,rinv);
+            
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+                       GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+            
+            /* Lennard-Jones */
+            tjA          = nti+2*type[jnrA];
+                       tjB          = nti+2*type[jnrB];
+            
+            GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+                       
+                       isaprod      = _mm_mul_pd(isai,isaj);
+                       qq           = _mm_mul_pd(iq,jq);            
+                       vcoul        = _mm_mul_pd(qq,rinv);
+                       fscal        = _mm_mul_pd(vcoul,rinv);                                 
+            vctot        = _mm_add_pd(vctot,vcoul);
+            
+            /* Polarization interaction */
+                       qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_pd(isaprod,gbtabscale);
+            
+                       /* Calculate GB table index */
+                       r            = _mm_mul_pd(rsq,rinv);
+                       rtab         = _mm_mul_pd(r,gbscale);
+                       
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
+            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+            GMX_MM_TRANSPOSE2_PD(Y,F);
+            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+            GMX_MM_TRANSPOSE2_PD(G,H);
+            
+            G       = _mm_mul_pd(G,eps);
+            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+            vgb     = _mm_mul_pd(Y, qq);           
+            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+            
+            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+            
+            vgbtot  = _mm_add_pd(vgbtot, vgb);
+            
+            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+            
+            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+                       
+                       rinvsix      = _mm_mul_pd(rinvsq,rinvsq);
+                       rinvsix      = _mm_mul_pd(rinvsix,rinvsq);
+                       
+                       vvdw6        = _mm_mul_pd(c6,rinvsix);
+                       vvdw12       = _mm_mul_pd(c12, _mm_mul_pd(rinvsix,rinvsix));
+                       vvdwtot      = _mm_add_pd(vvdwtot,_mm_sub_pd(vvdw12,vvdw6));
+            
+            fscal        = _mm_sub_pd(_mm_mul_pd(rinvsq, 
+                                                 _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
+                                                            _mm_mul_pd(six,vvdw6))),
+                                      _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
+                        
+            /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+            
+            /* Calculate temporary vectorial force */
+            tx           = _mm_mul_pd(fscal,dx);
+            ty           = _mm_mul_pd(fscal,dy);
+            tz           = _mm_mul_pd(fscal,dz);
+            
+            /* Increment i atom force */
+            fix          = _mm_add_pd(fix,tx);
+            fiy          = _mm_add_pd(fiy,ty);
+            fiz          = _mm_add_pd(fiz,tz);
+            
+            /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
                
                /* In double precision, offset can only be either 0 or 1 */
-               if(offset!=0)
+               if(k<nj1)
                {
-                       jnr1    = jjnr[k];
-                       j13             = jnr1*3;
-                       
-                       jx      = _mm_load_sd(pos+j13);
-                       jy      = _mm_load_sd(pos+j13+1);
-                       jz      = _mm_load_sd(pos+j13+2);
-                       
-                       isaj    = _mm_load_sd(invsqrta+jnr1);
-                       isaprod = _mm_mul_sd(isai,isaj);
-                       dvdaj   = _mm_load_sd(dvda+jnr1);
-                       q               = _mm_load_sd(charge+jnr1);
-                       qq      = _mm_mul_sd(iq,q);
-                       
-                       dx      = _mm_sub_sd(ix,jx);
-                       dy              = _mm_sub_sd(iy,jy);
-                       dz              = _mm_sub_sd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
-                       
-                       vcoul   = _mm_mul_sd(qq,rinv);
-                       fscal   = _mm_mul_sd(vcoul,rinv);
-                       qq              = _mm_mul_sd(isaprod,qq);
-      qq      = _mm_mul_sd(qq,gbfactor);
-                       qq              = _mm_mul_sd(qq,neg);
-                       gbscale = _mm_mul_sd(isaprod,gbtabscale);
-                       
-                       /* Load VdW parameters */
-                       tj      = nti+2*type[jnr1];
-                       
-                       c6      = _mm_load_sd(vdwparam+tj);
-                       c12     = _mm_load_sd(vdwparam+tj+1);
-                                               
-                       rinvsq  = _mm_mul_sd(rinv,rinv);
-                       
-                       r               = _mm_mul_sd(rsq11,rinv);
-                       rt              = _mm_mul_sd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_sd(rt,n0d);
-                       eps2    = _mm_mul_sd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); 
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); 
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); 
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); 
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); 
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); 
-                       
-                       G               = _mm_mul_sd(G,eps);
-                       H               = _mm_mul_sd(H,eps2);
-                       Fp              = _mm_add_sd(F,G);
-                       Fp              = _mm_add_sd(Fp,H);
-                       VV              = _mm_mul_sd(Fp,eps);
-                       VV              = _mm_add_sd(Y,VV);
-                       H               = _mm_mul_sd(two,H);
-                       FF              = _mm_add_sd(Fp,G);
-                       FF              = _mm_add_sd(FF,H);
-                       vgb             = _mm_mul_sd(qq,VV);
-                       fijC    = _mm_mul_sd(qq,FF);
-                       fijC    = _mm_mul_sd(fijC,gbscale);
-                       
-                       dvdatmp = _mm_mul_sd(fijC,r);
-                       dvdatmp = _mm_add_sd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_sd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_sd(dvdatmp,half);
-                       dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_sd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_sd(xmm1,isaj);
-                       dvdaj   = _mm_add_sd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       
-                       vctot   = _mm_add_sd(vctot,vcoul);
-                       vgbtot  = _mm_add_sd(vgbtot,vgb);
-                                               
-                       /* VdW interaction */
-                       rinvsix = _mm_mul_sd(rinvsq,rinvsq);
-                       rinvsix = _mm_mul_sd(rinvsix,rinvsq);
-                       
-                       Vvdw6   = _mm_mul_sd(c6,rinvsix);
-                       Vvdw12  = _mm_mul_sd(c12,rinvsix);
-                       Vvdw12  = _mm_mul_sd(Vvdw12,rinvsix);
-                       Vvdwtmp = _mm_sub_sd(Vvdw12,Vvdw6);
-                       Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);
-                       
-                       xmm1    = _mm_mul_sd(twelwe,Vvdw12);
-                       xmm2    = _mm_mul_sd(six,Vvdw6);
-                       xmm1    = _mm_sub_sd(xmm1,xmm2);
-                       xmm1    = _mm_mul_sd(xmm1,rinvsq);
-                       
-                       /* Scalar force */
-                       fscal   = _mm_sub_sd(fijC,fscal);
-                       fscal   = _mm_mul_sd(fscal,rinv);
-                       fscal   = _mm_sub_sd(xmm1,fscal);
-                       
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_sd(fscal,dx);
-                       t2              = _mm_mul_sd(fscal,dy);
-                       t3              = _mm_mul_sd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_sd(fix,t1);
-                       fiy             = _mm_add_sd(fiy,t2);
-                       fiz             = _mm_add_sd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm5    = _mm_load_sd(faction+j13);   /* fx */
-                       xmm6    = _mm_load_sd(faction+j13+1); /* fy */
-                       xmm7    = _mm_load_sd(faction+j13+2); /* fz */
-                       
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_sd(xmm5,t1);
-                       xmm6    = _mm_sub_sd(xmm6,t2);
-                       xmm7    = _mm_sub_sd(xmm7,t3);
-                       
-                       /* store forces */
-                       _mm_store_sd(faction+j13,xmm5);
-                       _mm_store_sd(faction+j13+1,xmm6);
-                       _mm_store_sd(faction+j13+2,xmm7);
+                       jnrA    = jjnr[k];
+                       
+                       j3A     = jnrA * 3;
+            
+            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+            
+                       dx           = _mm_sub_sd(ix,jx);
+                       dy           = _mm_sub_sd(iy,jy);
+                       dz           = _mm_sub_sd(iz,jz);
+            
+            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+            
+            rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_sd(rinv,rinv);
+            
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+                       GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+            
+            /* Lennard-Jones */
+            tjA          = nti+2*type[jnrA];
+            
+            GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+                       
+                       isaprod      = _mm_mul_sd(isai,isaj);
+                       qq           = _mm_mul_sd(iq,jq);            
+                       vcoul        = _mm_mul_sd(qq,rinv);
+                       fscal        = _mm_mul_sd(vcoul,rinv);                                 
+            vctot        = _mm_add_sd(vctot,vcoul);
+            
+            /* Polarization interaction */
+                       qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_sd(isaprod,gbtabscale);
+            
+                       /* Calculate GB table index */
+                       r            = _mm_mul_sd(rsq,rinv);
+                       rtab         = _mm_mul_sd(r,gbscale);
+                       
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
+            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+            F            = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(Y,F);
+            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+            H            = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(G,H);
+            
+            G       = _mm_mul_sd(G,eps);
+            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+            vgb     = _mm_mul_sd(Y, qq);           
+            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+            
+            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+            
+            vgbtot  = _mm_add_sd(vgbtot, vgb);
+            
+            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
+            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+            
+            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+                       
+                       rinvsix      = _mm_mul_sd(rinvsq,rinvsq);
+                       rinvsix      = _mm_mul_sd(rinvsix,rinvsq);
+                       
+                       vvdw6        = _mm_mul_sd(c6,rinvsix);
+                       vvdw12       = _mm_mul_sd(c12, _mm_mul_sd(rinvsix,rinvsix));
+                       vvdwtot      = _mm_add_sd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6));
+        
+            fscal        = _mm_sub_sd(_mm_mul_sd(rinvsq, 
+                                                 _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
+                                                            _mm_mul_sd(six,vvdw6))),
+                                      _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
+            
+            /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+            
+            /* Calculate temporary vectorial force */
+            tx           = _mm_mul_sd(fscal,dx);
+            ty           = _mm_mul_sd(fscal,dy);
+            tz           = _mm_mul_sd(fscal,dz);
+            
+            /* Increment i atom force */
+            fix          = _mm_add_sd(fix,tx);
+            fiy          = _mm_add_sd(fiy,ty);
+            fiz          = _mm_add_sd(fiz,tz);
+            
+            /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-               /* fix/fiy/fiz now contain four partial terms, that all should be
-                * added to the i particle forces
-                */
-               t1               = _mm_unpacklo_pd(t1,fix);
-               t2               = _mm_unpacklo_pd(t2,fiy);
-               t3               = _mm_unpacklo_pd(t3,fiz);
-               
-               fix              = _mm_add_pd(fix,t1);
-               fiy              = _mm_add_pd(fiy,t2);
-               fiz              = _mm_add_pd(fiz,t3);
-               
-               fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
-               fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
-               fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-               
-               /* Load i forces from memory */
-               xmm1     = _mm_load_sd(faction+ii3);
-               xmm2     = _mm_load_sd(faction+ii3+1);
-               xmm3     = _mm_load_sd(faction+ii3+2);
-               
-               /* Add to i force */
-               fix      = _mm_add_sd(fix,xmm1);
-               fiy      = _mm_add_sd(fiy,xmm2);
-               fiz      = _mm_add_sd(fiz,xmm3);
-               
-               /* store i forces to memory */
-               _mm_store_sd(faction+ii3,fix);
-               _mm_store_sd(faction+ii3+1,fiy);
-               _mm_store_sd(faction+ii3+2,fiz);
-               
-               /* Load i shift forces from memory */
-               xmm1     = _mm_load_sd(fshift+is3);
-               xmm2     = _mm_load_sd(fshift+is3+1);
-               xmm3     = _mm_load_sd(fshift+is3+2);
-               
-               /* Add to i force */
-               fix      = _mm_add_sd(fix,xmm1);
-               fiy      = _mm_add_sd(fiy,xmm2);
-               fiz      = _mm_add_sd(fiz,xmm3);
-               
-               /* store i shift forces to memory */
-               _mm_store_sd(fshift+is3,fix);
-               _mm_store_sd(fshift+is3+1,fiy);
-               _mm_store_sd(fshift+is3+2,fiz);
-               
-               /* now do dvda */
-               dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
-               dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
-               _mm_storeh_pd(&dva,dvdasum);
-               dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-               
-               ggid     = gid[n];
-               
-               /* Coulomb potential */
-               vcoul    = _mm_unpacklo_pd(vcoul,vctot);
-               vctot    = _mm_add_pd(vctot,vcoul);
-               _mm_storeh_pd(&vct,vctot);
-               Vc[ggid] = Vc[ggid] + vct;
-               
-               /* VdW potential */
-               Vvdwtmp  = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
-               Vvdwtot  = _mm_add_pd(Vvdwtot,Vvdwtmp);
-               _mm_storeh_pd(&vdwt,Vvdwtot);
-               Vvdw[ggid] = Vvdw[ggid] + vdwt;
-               
-               /* GB potential */
-               vgb      = _mm_unpacklo_pd(vgb,vgbtot);
-               vgbtot   = _mm_add_pd(vgbtot,vgb);
-               _mm_storeh_pd(&vgbt,vgbtot);
-               gpol[ggid] = gpol[ggid] + vgbt;
+        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+
+        ggid     = gid[n];         
+        
+        gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
+        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
        }
 
        *outeriter   = nri;            
     *inneriter   = nj1;        
-       
-       
-       
-       
-       
 }
index 164c3194cf47bfa2009d146c6dfea3bb2767f527..87159d59deb0ca5a9082ab249b0c4f413e1625c0 100644 (file)
 /* get gmx_gbdata_t */
 #include "../nb_kerneltype.h"
 
-
+#include "nb_kernel430_ia32_sse2.h"
 
 void nb_kernel430_ia32_sse2(int *           p_nri,
-                                                       int *           iinr,
-                                                       int *           jindex,
-                                                       int *           jjnr,
-                                                       int *           shift,
-                                                       double *         shiftvec,
-                                                       double *         fshift,
-                                                       int *           gid,
-                                                       double *         pos,
-                                                       double *         faction,
-                                                       double *         charge,
-                                                       double *         p_facel,
-                                                       double *         p_krf,
-                                                       double *         p_crf,
-                                                       double *         Vc,
-                                                       int *           type,
-                                                       int *           p_ntype,
-                                                       double *         vdwparam,
-                                                       double *         Vvdw,
-                                                       double *         p_tabscale,
-                                                       double *         VFtab,
-                                                       double *         invsqrta,
-                                                       double *         dvda,
-                                                       double *         p_gbtabscale,
-                                                       double *         GBtab,
-                                                       int *           p_nthreads,
-                                                       int *           count,
-                                                       void *          mtx,
-                                                       int *           outeriter,
-                                                       int *           inneriter,
-                                                       double *         work)
+                              int *           iinr,
+                              int *           jindex,
+                              int *           jjnr,
+                              int *           shift,
+                              double *         shiftvec,
+                              double *         fshift,
+                              int *           gid,
+                              double *         pos,
+                              double *         faction,
+                              double *         charge,
+                              double *         p_facel,
+                              double *         p_krf,
+                              double *         p_crf,
+                              double *         vc,
+                              int *           type,
+                              int *           p_ntype,
+                              double *         vdwparam,
+                              double *         vvdw,
+                              double *         p_tabscale,
+                              double *         VFtab,
+                              double *         invsqrta,
+                              double *         dvda,
+                              double *         p_gbtabscale,
+                              double *         GBtab,
+                              int *           p_nthreads,
+                              int *           count,
+                              void *          mtx,
+                              int *           outeriter,
+                              int *           inneriter,
+                              double *         work)
 {
-       int           nri,ntype,nthreads,offset,tj,tj2,nti;
-       int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
-       double        facel,krf,crf,tabscl,gbtabscl,vct,vdwt,vgbt,nt1,nt2;
-       double        shX,shY,shZ,isai_d,dva;
+    int           nri,ntype,nthreads;
+    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
+    double        shX,shY,shZ;
+       int                       offset,nti;
+    int           jnrA,jnrB;
+    int           j3A,j3B;
+       int           tjA,tjB;
        gmx_gbdata_t *gbdata;
-       double *        gpol;
-
-       __m128d       ix,iy,iz,jx,jy,jz;
-       __m128d           dx,dy,dz,t1,t2,t3;
-       __m128d           fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
-       __m128d           q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
-       __m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp,dvdasum,vctot,n0d;
-       __m128d           xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
-       __m128d       c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,vgbtot,rinvsq,rinvsix;
-       __m128d       fac,tabscale,gbtabscale,gbfactor;
-       __m128i       n0,nnn;
-       
-       const __m128d neg    = {-1.0f,-1.0f};
-       const __m128d zero   = {0.0f,0.0f};
-       const __m128d half   = {0.5f,0.5f};
-       const __m128d two    = {2.0f,2.0f};
-       const __m128d three  = {3.0f,3.0f};
-       const __m128d six    = {6.0f,6.0f};
-       const __m128d twelwe = {12.0f,12.0f};
+       double *      gpol;
+    
+       __m128d  iq,qq,jq,isai;
+       __m128d  ix,iy,iz;
+       __m128d  jx,jy,jz;
+       __m128d  dx,dy,dz;
+       __m128d  vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
+       __m128d  fix,fiy,fiz,tx,ty,tz,rsq;
+       __m128d  rinv,isaj,isaprod;
+       __m128d  vcoul,fscal,gbscale,c6,c12;
+       __m128d  rinvsq,r,rtab;
+       __m128d  eps,Y,F,G,H;
+    __m128d  VV,FF,Fp;
+       __m128d  vgb,fijGB,dvdatmp;
+       __m128d  rinvsix,vvdw6,vvdw12,vvdwtmp;
+       __m128d  facel,gbtabscale,dvdaj;
+    __m128d  fijD,fijR,fijC;
+    __m128d  xmm1,tabscale,eps2;
+       __m128i  n0, nnn;
+    
        
-       const __m128i four   = _mm_set_epi32(4,4,4,4);
+       const __m128d neg        = _mm_set1_pd(-1.0);
+       const __m128d zero       = _mm_set1_pd(0.0);
+       const __m128d minushalf  = _mm_set1_pd(-0.5);
+       const __m128d two        = _mm_set1_pd(2.0);
        
        gbdata     = (gmx_gbdata_t *)work;
        gpol       = gbdata->gpol;
-
+    
        nri        = *p_nri;
        ntype      = *p_ntype;
-       nthreads   = *p_nthreads; 
-  facel      = *p_facel;      
-       krf        = *p_krf;
-       crf        = *p_crf;
-       tabscl     = *p_tabscale;
-       gbtabscl   = *p_gbtabscale;
-       nj1        = 0;
+    
+    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
+    gbtabscale = _mm_load1_pd(p_gbtabscale);  
+    facel      = _mm_load1_pd(p_facel);
+    tabscale   = _mm_load1_pd(p_tabscale);
+    
+    nj1         = 0;
+    jnrA = jnrB = 0;
+    j3A = j3B   = 0;
+    jx          = _mm_setzero_pd();
+    jy          = _mm_setzero_pd();
+    jz          = _mm_setzero_pd();
+    c6          = _mm_setzero_pd();
+    c12         = _mm_setzero_pd();
        
-       /* Splat variables */
-       fac        = _mm_load1_pd(&facel);
-       tabscale   = _mm_load1_pd(&tabscl);
-       gbtabscale = _mm_load1_pd(&gbtabscl);
-  gbfactor   = _mm_set1_pd(((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
-               
-       /* Keep compiler happy */
-       Vvdwtmp = _mm_setzero_pd();
-       Vvdwtot = _mm_setzero_pd();
-       dvdatmp = _mm_setzero_pd();
-       dvdaj   = _mm_setzero_pd();
-       isaj    = _mm_setzero_pd();
-       vcoul   = _mm_setzero_pd();
-       vgb     = _mm_setzero_pd();
-       t1      = _mm_setzero_pd();
-       t2      = _mm_setzero_pd();
-       t3      = _mm_setzero_pd();
-       xmm1    = _mm_setzero_pd();
-       xmm2    = _mm_setzero_pd();
-       xmm3    = _mm_setzero_pd();
-       xmm4    = _mm_setzero_pd();
-       jnr1    = jnr2 = 0;
-       j13     = j23  = 0;
-               
        for(n=0;n<nri;n++)
        {
-               is3     = 3*shift[n];
-               shX     = shiftvec[is3];
-               shY     = shiftvec[is3+1];
-               shZ     = shiftvec[is3+2];
+        is3              = 3*shift[n];     
+        shX              = shiftvec[is3];  
+        shY              = shiftvec[is3+1];
+        shZ              = shiftvec[is3+2];
+        nj0              = jindex[n];      
+        nj1              = jindex[n+1];    
+        ii               = iinr[n];        
+        ii3              = 3*ii;           
                
-               nj0     = jindex[n];      
-        nj1     = jindex[n+1];  
-               offset  = (nj1-nj0)%2;
+               ix               = _mm_set1_pd(shX+pos[ii3+0]);
+               iy               = _mm_set1_pd(shY+pos[ii3+1]);
+               iz               = _mm_set1_pd(shZ+pos[ii3+2]);
+        
+               iq               = _mm_load1_pd(charge+ii);
+               iq               = _mm_mul_pd(iq,facel);
+        
+               isai             = _mm_load1_pd(invsqrta+ii);
+        
+               nti              = 2*ntype*type[ii];
                
-               ii      = iinr[n];
-               ii3     = ii*3;
-               
-               ix      = _mm_set1_pd(shX+pos[ii3+0]);
-               iy      = _mm_set1_pd(shX+pos[ii3+1]);
-               iz      = _mm_set1_pd(shX+pos[ii3+2]); 
-               q       = _mm_set1_pd(charge[ii]);
-               
-               iq      = _mm_mul_pd(fac,q); 
-               isai_d  = invsqrta[ii];
-               isai    = _mm_load1_pd(&isai_d);
-               
-               nti      = 2*ntype*type[ii];
-               
-               fix     = _mm_setzero_pd();
-               fiy     = _mm_setzero_pd();
-               fiz     = _mm_setzero_pd();
-               dvdasum = _mm_setzero_pd();
-               vctot   = _mm_setzero_pd();
-               vgbtot  = _mm_setzero_pd();
-               Vvdwtot = _mm_setzero_pd();
-               
-               for(k=nj0;k<nj1-offset; k+=2)
+               vctot            = _mm_setzero_pd();
+               vvdwtot          = _mm_setzero_pd();
+               vgbtot           = _mm_setzero_pd();
+               dvdasum          = _mm_setzero_pd();
+               fix              = _mm_setzero_pd();
+               fiy              = _mm_setzero_pd();
+               fiz              = _mm_setzero_pd();
+        
+               for(k=nj0;k<nj1-1; k+=2)
                {
-                       jnr1    = jjnr[k];
-                       jnr2    = jjnr[k+1];
-                       
-                       j13     = jnr1 * 3;
-                       j23     = jnr2 * 3;
-                       
-                       /* Load coordinates */
-                       xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
-                       xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */
-                       
-                       xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
-                       xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */
-                       
-                       /* transpose */
-                       jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); 
-                       
-                       /* distances */
-                       dx      = _mm_sub_pd(ix,jx);
-                       dy              = _mm_sub_pd(iy,jy);
-                       dz              = _mm_sub_pd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
-                       
-                       /* Load invsqrta */
-                       isaj    = _mm_loadl_pd(isaj,invsqrta+jnr1);
-                       isaj    = _mm_loadh_pd(isaj,invsqrta+jnr2);
-                       isaprod = _mm_mul_pd(isai,isaj);
-                       
-                       /* Load charges */
-                       q               = _mm_loadl_pd(q,charge+jnr1);
-                       q               = _mm_loadh_pd(q,charge+jnr2);
-                       qq              = _mm_mul_pd(iq,q);
-                       
-                       vcoul   = _mm_mul_pd(qq,rinv);
-                       fscal   = _mm_mul_pd(vcoul,rinv);
-                       qq              = _mm_mul_pd(isaprod,qq);
-                       qq              = _mm_mul_pd(qq,neg);
-      qq      = _mm_mul_pd(qq,gbfactor);
-                       gbscale = _mm_mul_pd(isaprod,gbtabscale);
-                       
-                       /* Load VdW parameters */
-                       tj      = nti+2*type[jnr1];
-                       tj2     = nti+2*type[jnr2];
-                       
-                       xmm1      = _mm_loadu_pd(vdwparam+tj);
-                       xmm2     = _mm_loadu_pd(vdwparam+tj2);
-                       c6      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
-                       c12     = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
-                                               
-                       /* Load dvdaj */
-                       dvdaj   = _mm_loadl_pd(dvdaj, dvda+jnr1);
-                       dvdaj   = _mm_loadh_pd(dvdaj, dvda+jnr2);
-                       
-                       /* Calculate GB table index */
-                       r               = _mm_mul_pd(rsq11,rinv);
-                       rt              = _mm_mul_pd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_pd(rt,n0d);
-                       eps2    = _mm_mul_pd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G               = _mm_mul_pd(G,eps);
-                       H               = _mm_mul_pd(H,eps2);
-                       Fp              = _mm_add_pd(F,G);
-                       Fp              = _mm_add_pd(Fp,H);
-                       VV              = _mm_mul_pd(Fp,eps);
-                       VV              = _mm_add_pd(Y,VV);
-                       H               = _mm_mul_pd(two,H);
-                       FF              = _mm_add_pd(Fp,G);
-                       FF              = _mm_add_pd(FF,H);
-                       vgb             = _mm_mul_pd(qq,VV);
-                       fijC    = _mm_mul_pd(qq,FF);
-                       fijC    = _mm_mul_pd(fijC,gbscale);
-                       
-                       dvdatmp = _mm_mul_pd(fijC,r);
-                       dvdatmp = _mm_add_pd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_pd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_pd(dvdatmp,half);
-                       dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_pd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_pd(xmm1,isaj);
-                       dvdaj   = _mm_add_pd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       _mm_storeh_pd(dvda+jnr2,dvdaj);
-                       
-                       vctot   = _mm_add_pd(vctot,vcoul);
-                       vgbtot  = _mm_add_pd(vgbtot,vgb);
-                       
-                       /* Calculate VDW table index */
-                       rt      = _mm_mul_pd(r,tabscale);
-                       n0      = _mm_cvttpd_epi32(rt);
-                       n0d     = _mm_cvtepi32_pd(n0);
-                       eps     = _mm_sub_pd(rt,n0d);
+                       jnrA    = jjnr[k];
+                       jnrB    = jjnr[k+1];
+                       
+                       j3A     = jnrA * 3;
+                       j3B     = jnrB * 3;
+            
+            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+            
+                       dx           = _mm_sub_pd(ix,jx);
+                       dy           = _mm_sub_pd(iy,jy);
+                       dz           = _mm_sub_pd(iz,jz);
+            
+            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+            
+            rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_pd(rinv,rinv);
+            
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+                       GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+            
+            /* Lennard-Jones */
+            tjA          = nti+2*type[jnrA];
+                       tjB          = nti+2*type[jnrB];
+            
+            GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+                       
+                       isaprod      = _mm_mul_pd(isai,isaj);
+                       qq           = _mm_mul_pd(iq,jq);            
+                       vcoul        = _mm_mul_pd(qq,rinv);
+                       fscal        = _mm_mul_pd(vcoul,rinv);                                 
+            vctot        = _mm_add_pd(vctot,vcoul);
+            
+            /* Polarization interaction */
+                       qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_pd(isaprod,gbtabscale);
+            
+                       /* Calculate GB table index */
+                       r            = _mm_mul_pd(rsq,rinv);
+                       rtab         = _mm_mul_pd(r,gbscale);
+                       
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
+            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+            GMX_MM_TRANSPOSE2_PD(Y,F);
+            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+            GMX_MM_TRANSPOSE2_PD(G,H);
+            
+            G       = _mm_mul_pd(G,eps);
+            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
+            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+            vgb     = _mm_mul_pd(Y, qq);           
+            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+            
+            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+            
+            vgbtot  = _mm_add_pd(vgbtot, vgb);
+            
+            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+            
+            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+                       
+            /* Calculate VDW table index */
+                       rtab    = _mm_mul_pd(r,tabscale);
+                       n0      = _mm_cvttpd_epi32(rtab);
+                       eps     = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
                        eps2    = _mm_mul_pd(eps,eps);
                        nnn     = _mm_slli_epi32(n0,3);
                        
-                       /* Tabulated VdW interaction - dispersion */
-                       xmm1    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G       = _mm_mul_pd(G,eps);
+            /* Dispersion */
+            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
+            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
+            GMX_MM_TRANSPOSE2_PD(Y,F);
+            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
+            GMX_MM_TRANSPOSE2_PD(G,H);
+            
+            G       = _mm_mul_pd(G,eps);
                        H       = _mm_mul_pd(H,eps2);
                        Fp      = _mm_add_pd(F,G);
                        Fp      = _mm_add_pd(Fp,H);
@@ -294,23 +243,18 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        FF      = _mm_add_pd(Fp,G);
                        FF      = _mm_add_pd(FF,xmm1);
                        
-                       Vvdw6   = _mm_mul_pd(c6,VV);
+                       vvdw6   = _mm_mul_pd(c6,VV);
                        fijD    = _mm_mul_pd(c6,FF);
-                       
-                       /* Tabulated VdW interaction - repulsion */
-                       nnn     = _mm_add_epi32(nnn,four);
-                       
-                       xmm1    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G       = _mm_mul_pd(G,eps);
+            
+            /* Repulsion */
+            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
+            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
+            GMX_MM_TRANSPOSE2_PD(Y,F);
+            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
+            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
+            GMX_MM_TRANSPOSE2_PD(G,H);
+            
+            G       = _mm_mul_pd(G,eps);
                        H       = _mm_mul_pd(H,eps2);
                        Fp      = _mm_add_pd(F,G);
                        Fp      = _mm_add_pd(Fp,H);
@@ -320,163 +264,124 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        FF      = _mm_add_pd(Fp,G);
                        FF      = _mm_add_pd(FF,xmm1);
                        
-                       Vvdw12  = _mm_mul_pd(c12,VV);
+                       vvdw12  = _mm_mul_pd(c12,VV);
                        fijR    = _mm_mul_pd(c12,FF);
                        
-                       Vvdwtmp = _mm_add_pd(Vvdw12,Vvdw6);
-                       Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
-                       
+                       vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
+                       vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
+            
                        xmm1    = _mm_add_pd(fijD,fijR);
                        xmm1    = _mm_mul_pd(xmm1,tabscale);
                        xmm1    = _mm_add_pd(xmm1,fijC);
                        xmm1    = _mm_sub_pd(xmm1,fscal);
                        fscal   = _mm_mul_pd(xmm1,neg);
                        fscal   = _mm_mul_pd(fscal,rinv);
-                                               
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_pd(fscal,dx);
-                       t2              = _mm_mul_pd(fscal,dy);
-                       t3              = _mm_mul_pd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_pd(fix,t1);
-                       fiy             = _mm_add_pd(fiy,t2);
-                       fiz             = _mm_add_pd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm1    = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
-                       xmm2    = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-                       
-                       xmm5    = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
-                       xmm6    = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-                       
-                       /* transpose */
-                       xmm7    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
-                       xmm5    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
-                       xmm6    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_pd(xmm5,t1);
-                       xmm6    = _mm_sub_pd(xmm6,t2);
-                       xmm7    = _mm_sub_pd(xmm7,t3);
-                       
-                       xmm1    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
-                       xmm2    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-                       
-                       /* store fx and fy */
-                       _mm_storeu_pd(faction+j13,xmm1);
-                       _mm_storeu_pd(faction+j23,xmm2);
-                       
-                       /* .. then fz */
-                       _mm_storel_pd(faction+j13+2,xmm7);
-                       _mm_storel_pd(faction+j23+2,xmm7);
+            
+            /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+            
+            /* Calculate temporary vectorial force */
+            tx           = _mm_mul_pd(fscal,dx);
+            ty           = _mm_mul_pd(fscal,dy);
+            tz           = _mm_mul_pd(fscal,dz);
+            
+            /* Increment i atom force */
+            fix          = _mm_add_pd(fix,tx);
+            fiy          = _mm_add_pd(fiy,ty);
+            fiz          = _mm_add_pd(fiz,tz);
+            
+            /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
                }
                
                /* In double precision, offset can only be either 0 or 1 */
-               if(offset!=0)
+               if(k<nj1)
                {
-                       jnr1    = jjnr[k];
-                       j13             = jnr1*3;
-                       
-                       jx      = _mm_load_sd(pos+j13);
-                       jy      = _mm_load_sd(pos+j13+1);
-                       jz      = _mm_load_sd(pos+j13+2);
-                       
-                       isaj    = _mm_load_sd(invsqrta+jnr1);
-                       isaprod = _mm_mul_sd(isai,isaj);
-                       dvdaj   = _mm_load_sd(dvda+jnr1);
-                       q               = _mm_load_sd(charge+jnr1);
-                       qq      = _mm_mul_sd(iq,q);
-                       
-                       dx      = _mm_sub_sd(ix,jx);
-                       dy              = _mm_sub_sd(iy,jy);
-                       dz              = _mm_sub_sd(iz,jz);
-                       
-                       rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
-                       rinv    = gmx_mm_invsqrt_pd(rsq11);
-                       
-                       vcoul   = _mm_mul_sd(qq,rinv);
-                       fscal   = _mm_mul_sd(vcoul,rinv);
-                       qq              = _mm_mul_sd(isaprod,qq);
-                       qq              = _mm_mul_sd(qq,neg);
-      qq      = _mm_mul_sd(qq,gbfactor);
-                       gbscale = _mm_mul_sd(isaprod,gbtabscale);
-                       
-                       /* Load VdW parameters */
-                       tj      = nti+2*type[jnr1];
-                       
-                       c6      = _mm_load_sd(vdwparam+tj);
-                       c12     = _mm_load_sd(vdwparam+tj+1);
-                                       
-                       /* Calculate GB table index */
-                       r               = _mm_mul_sd(rsq11,rinv);
-                       rt              = _mm_mul_sd(r,gbscale);
-                       n0              = _mm_cvttpd_epi32(rt);
-                       n0d             = _mm_cvtepi32_pd(n0);
-                       eps             = _mm_sub_sd(rt,n0d);
-                       eps2    = _mm_mul_sd(eps,eps);
-                       
-                       nnn             = _mm_slli_epi64(n0,2);
-                       
-                       xmm1    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); 
-                       xmm2    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); 
-                       xmm3    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); 
-                       xmm4    = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); 
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); 
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); 
-                       
-                       G               = _mm_mul_sd(G,eps);
-                       H               = _mm_mul_sd(H,eps2);
-                       Fp              = _mm_add_sd(F,G);
-                       Fp              = _mm_add_sd(Fp,H);
-                       VV              = _mm_mul_sd(Fp,eps);
-                       VV              = _mm_add_sd(Y,VV);
-                       H               = _mm_mul_sd(two,H);
-                       FF              = _mm_add_sd(Fp,G);
-                       FF              = _mm_add_sd(FF,H);
-                       vgb             = _mm_mul_sd(qq,VV);
-                       fijC    = _mm_mul_sd(qq,FF);
-                       fijC    = _mm_mul_sd(fijC,gbscale);
-                       
-                       dvdatmp = _mm_mul_sd(fijC,r);
-                       dvdatmp = _mm_add_sd(vgb,dvdatmp);
-                       dvdatmp = _mm_mul_sd(dvdatmp,neg);
-                       dvdatmp = _mm_mul_sd(dvdatmp,half);
-                       dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-                       
-                       xmm1    = _mm_mul_sd(dvdatmp,isaj);
-                       xmm1    = _mm_mul_sd(xmm1,isaj);
-                       dvdaj   = _mm_add_sd(dvdaj,xmm1);
-                       
-                       /* store dvda */
-                       _mm_storel_pd(dvda+jnr1,dvdaj);
-                       
-                       vctot   = _mm_add_sd(vctot,vcoul);
-                       vgbtot  = _mm_add_sd(vgbtot,vgb);
-                       
-                       /* Calculate VDW table index */
-                       rt      = _mm_mul_sd(r,tabscale);
-                       n0      = _mm_cvttpd_epi32(rt);
-                       n0d     = _mm_cvtepi32_pd(n0);
-                       eps     = _mm_sub_sd(rt,n0d);
+                       jnrA    = jjnr[k];
+                       j3A     = jnrA * 3;
+            
+            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+            
+                       dx           = _mm_sub_sd(ix,jx);
+                       dy           = _mm_sub_sd(iy,jy);
+                       dz           = _mm_sub_sd(iz,jz);
+            
+            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
+            
+            rinv         = gmx_mm_invsqrt_pd(rsq);
+                       rinvsq       = _mm_mul_sd(rinv,rinv);
+            
+                       /***********************************/
+                       /* INTERACTION SECTION STARTS HERE */
+                       /***********************************/
+                       GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+                       GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+            
+            /* Lennard-Jones */
+            tjA          = nti+2*type[jnrA];
+            
+            GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+                       
+                       isaprod      = _mm_mul_sd(isai,isaj);
+                       qq           = _mm_mul_sd(iq,jq);            
+                       vcoul        = _mm_mul_sd(qq,rinv);
+                       fscal        = _mm_mul_sd(vcoul,rinv);                                 
+            vctot        = _mm_add_sd(vctot,vcoul);
+            
+            /* Polarization interaction */
+                       qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+                       gbscale      = _mm_mul_sd(isaprod,gbtabscale);
+            
+                       /* Calculate GB table index */
+                       r            = _mm_mul_sd(rsq,rinv);
+                       rtab         = _mm_mul_sd(r,gbscale);
+                       
+                       n0                   = _mm_cvttpd_epi32(rtab);
+                       eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+                       nnn                  = _mm_slli_epi32(n0,2);
+                       
+            /* the tables are 16-byte aligned, so we can use _mm_load_pd */                    
+            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
+            F            = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(Y,F);
+            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+            H            = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(G,H);
+            
+            G       = _mm_mul_sd(G,eps);
+            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
+            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+            vgb     = _mm_mul_sd(Y, qq);           
+            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+            
+            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+            
+            vgbtot  = _mm_add_sd(vgbtot, vgb);
+            
+            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
+            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+            
+            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+                       
+            /* Calculate VDW table index */
+                       rtab    = _mm_mul_sd(r,tabscale);
+                       n0      = _mm_cvttpd_epi32(rtab);
+                       eps     = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
                        eps2    = _mm_mul_sd(eps,eps);
                        nnn     = _mm_slli_epi32(n0,3);
                        
-                       /* Tabulated VdW interaction - dispersion */
-                       xmm1    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G       = _mm_mul_sd(G,eps);
+            /* Dispersion */
+            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
+            F            = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(Y,F);
+            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
+            H            = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(G,H);
+            
+            G       = _mm_mul_sd(G,eps);
                        H       = _mm_mul_sd(H,eps2);
                        Fp      = _mm_add_sd(F,G);
                        Fp      = _mm_add_sd(Fp,H);
@@ -486,23 +391,18 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        FF      = _mm_add_sd(Fp,G);
                        FF      = _mm_add_sd(FF,xmm1);
                        
-                       Vvdw6   = _mm_mul_sd(c6,VV);
+                       vvdw6   = _mm_mul_sd(c6,VV);
                        fijD    = _mm_mul_sd(c6,FF);
-                       
-                       /* Tabulated VdW interaction - repulsion */
-                       nnn     = _mm_add_epi32(nnn,four);
-                       
-                       xmm1    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
-                       xmm2    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
-                       xmm3    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
-                       xmm4    = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-                       
-                       Y               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
-                       F               = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
-                       G               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
-                       H               = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-                       
-                       G       = _mm_mul_sd(G,eps);
+            
+            /* Repulsion */
+            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
+            F            = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(Y,F);
+            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
+            H            = _mm_setzero_pd();
+            GMX_MM_TRANSPOSE2_PD(G,H);
+            
+            G       = _mm_mul_sd(G,eps);
                        H       = _mm_mul_sd(H,eps2);
                        Fp      = _mm_add_sd(F,G);
                        Fp      = _mm_add_sd(Fp,H);
@@ -512,122 +412,47 @@ void nb_kernel430_ia32_sse2(int *           p_nri,
                        FF      = _mm_add_sd(Fp,G);
                        FF      = _mm_add_sd(FF,xmm1);
                        
-                       Vvdw12  = _mm_mul_sd(c12,VV);
+                       vvdw12  = _mm_mul_sd(c12,VV);
                        fijR    = _mm_mul_sd(c12,FF);
                        
-                       Vvdwtmp = _mm_add_sd(Vvdw12,Vvdw6);
-                       Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);
-                       
+                       vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
+                       vvdwtot = _mm_add_sd(vvdwtot,vvdwtmp);
+            
                        xmm1    = _mm_add_sd(fijD,fijR);
                        xmm1    = _mm_mul_sd(xmm1,tabscale);
                        xmm1    = _mm_add_sd(xmm1,fijC);
                        xmm1    = _mm_sub_sd(xmm1,fscal);
                        fscal   = _mm_mul_sd(xmm1,neg);
                        fscal   = _mm_mul_sd(fscal,rinv);
-                       
-                       /* calculate partial force terms */
-                       t1              = _mm_mul_sd(fscal,dx);
-                       t2              = _mm_mul_sd(fscal,dy);
-                       t3              = _mm_mul_sd(fscal,dz);
-                       
-                       /* update the i force */
-                       fix             = _mm_add_sd(fix,t1);
-                       fiy             = _mm_add_sd(fiy,t2);
-                       fiz             = _mm_add_sd(fiz,t3);
-                       
-                       /* accumulate forces from memory */
-                       xmm5    = _mm_load_sd(faction+j13);   /* fx */
-                       xmm6    = _mm_load_sd(faction+j13+1); /* fy */
-                       xmm7    = _mm_load_sd(faction+j13+2); /* fz */
-                       
-                       /* subtract partial forces */
-                       xmm5    = _mm_sub_sd(xmm5,t1);
-                       xmm6    = _mm_sub_sd(xmm6,t2);
-                       xmm7    = _mm_sub_sd(xmm7,t3);
-                       
-                       /* store forces */
-                       _mm_store_sd(faction+j13,xmm5);
-                       _mm_store_sd(faction+j13+1,xmm6);
-                       _mm_store_sd(faction+j13+2,xmm7);
+
+            /***********************************/
+                       /*  INTERACTION SECTION ENDS HERE  */
+                       /***********************************/
+            
+            /* Calculate temporary vectorial force */
+            tx           = _mm_mul_sd(fscal,dx);
+            ty           = _mm_mul_sd(fscal,dy);
+            tz           = _mm_mul_sd(fscal,dz);
+            
+            /* Increment i atom force */
+            fix          = _mm_add_sd(fix,tx);
+            fiy          = _mm_add_sd(fiy,ty);
+            fiz          = _mm_add_sd(fiz,tz);
+            
+            /* Store j forces back */
+                       GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
                }
                
-               /* fix/fiy/fiz now contain four partial terms, that all should be
-                * added to the i particle forces
-                */
-               t1               = _mm_unpacklo_pd(t1,fix);
-               t2               = _mm_unpacklo_pd(t2,fiy);
-               t3               = _mm_unpacklo_pd(t3,fiz);
-               
-               fix              = _mm_add_pd(fix,t1);
-               fiy              = _mm_add_pd(fiy,t2);
-               fiz              = _mm_add_pd(fiz,t3);
-               
-               fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
-               fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
-               fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-               
-               /* Load i forces from memory */
-               xmm1     = _mm_load_sd(faction+ii3);
-               xmm2     = _mm_load_sd(faction+ii3+1);
-               xmm3     = _mm_load_sd(faction+ii3+2);
-               
-               /* Add to i force */
-               fix      = _mm_add_sd(fix,xmm1);
-               fiy      = _mm_add_sd(fiy,xmm2);
-               fiz      = _mm_add_sd(fiz,xmm3);
-               
-               /* store i forces to memory */
-               _mm_store_sd(faction+ii3,fix);
-               _mm_store_sd(faction+ii3+1,fiy);
-               _mm_store_sd(faction+ii3+2,fiz);
-               
-               /* Load i shift forces from memory */
-               xmm1     = _mm_load_sd(fshift+is3);
-               xmm2     = _mm_load_sd(fshift+is3+1);
-               xmm3     = _mm_load_sd(fshift+is3+2);
-               
-               /* Add to i force */
-               fix      = _mm_add_sd(fix,xmm1);
-               fiy      = _mm_add_sd(fiy,xmm2);
-               fiz      = _mm_add_sd(fiz,xmm3);
-               
-               /* store i forces to memory */
-               _mm_store_sd(fshift+is3,fix);
-               _mm_store_sd(fshift+is3+1,fiy);
-               _mm_store_sd(fshift+is3+2,fiz);
-               
-               /* now do dvda */
-               dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
-               dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
-               _mm_storeh_pd(&dva,dvdasum);
-               dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-               
-               ggid     = gid[n];
-               
-               /* Coulomb potential */
-               vcoul    = _mm_unpacklo_pd(vcoul,vctot);
-               vctot    = _mm_add_pd(vctot,vcoul);
-               _mm_storeh_pd(&vct,vctot);
-               Vc[ggid] = Vc[ggid] + vct;
-               
-               /* VdW potential */
-               Vvdwtmp  = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
-               Vvdwtot  = _mm_add_pd(Vvdwtot,Vvdwtmp);
-               _mm_storeh_pd(&vdwt,Vvdwtot);
-               Vvdw[ggid] = Vvdw[ggid] + vdwt;
-               
-               /* GB potential */
-               vgb      = _mm_unpacklo_pd(vgb,vgbtot);
-               vgbtot   = _mm_add_pd(vgbtot,vgb);
-               _mm_storeh_pd(&vgbt,vgbtot);
-               gpol[ggid] = gpol[ggid] + vgbt;
+        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+        
+        ggid     = gid[n];         
+        
+        gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
+        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
        }
-       
+    
        *outeriter   = nri;            
     *inneriter   = nj1;        
-       
-       
-       
-       
-       
 }
+