*
* Options used when generating this file:
* Language: c
- * Precision: double
+ * Precision: single
* Threads: no
* Software invsqrt: yes
* Prefetch forces: no
#include <xmmintrin.h>
#include <emmintrin.h>
-#include <gmx_sse2_double.h>
+#include <gmx_sse2_single.h>
/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
-#include "nb_kernel400_ia32_sse2.h"
+#include "nb_kernel400_ia32_sse.h"
-void nb_kernel400_ia32_sse2(int * p_nri,
- int * iinr,
- int * jindex,
- int * jjnr,
- int * shift,
- double * shiftvec,
- double * fshift,
- int * gid,
- double * pos,
- double * faction,
- double * charge,
- double * p_facel,
- double * p_krf,
- double * p_crf,
- double * vc,
- int * type,
- int * p_ntype,
- double * vdwparam,
- double * vvdw,
- double * p_tabscale,
- double * VFtab,
- double * invsqrta,
- double * dvda,
- double * p_gbtabscale,
- double * GBtab,
- int * p_nthreads,
- int * count,
- void * mtx,
- int * outeriter,
- int * inneriter,
- double * work)
+
+void nb_kernel400_ia32_sse(int * p_nri,
+ int * iinr,
+ int * jindex,
+ int * jjnr,
+ int * shift,
+ float * shiftvec,
+ float * fshift,
+ int * gid,
+ float * pos,
+ float * faction,
+ float * charge,
+ float * p_facel,
+ float * p_krf,
+ float * p_crf,
+ float * vc,
+ int * type,
+ int * p_ntype,
+ float * vdwparam,
+ float * vvdw,
+ float * p_tabscale,
+ float * VFtab,
+ float * invsqrta,
+ float * dvda,
+ float * p_gbtabscale,
+ float * GBtab,
+ int * p_nthreads,
+ int * count,
+ void * mtx,
+ int * outeriter,
+ int * inneriter,
+ float * work)
{
int nri,nthreads;
int n,ii,is3,ii3,k,nj0,nj1,ggid;
- double shX,shY,shZ;
- int jnrA,jnrB;
- int j3A,j3B;
+ float shX,shY,shZ;
+ int offset;
+ int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
+ int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
gmx_gbdata_t *gbdata;
- double * gpol;
+ float * gpol;
+
+ __m128 iq,qq,qqB,jq,jqB,isai;
+ __m128 ix,iy,iz;
+ __m128 jx,jy,jz;
+ __m128 jxB,jyB,jzB;
+ __m128 dx,dy,dz;
+ __m128 dxB,dyB,dzB;
+ __m128 vctot,vgbtot,dvdasum,gbfactor;
+ __m128 fix,fiy,fiz,tx,ty,tz,rsq;
+ __m128 fixB,fiyB,fizB,txB,tyB,tzB,rsqB;
+ __m128 rinv,isaj,isaprod;
+ __m128 rinvB,isajB,isaprodB;
+ __m128 vcoul,vcoulB,fscal,fscalB,gbscale,gbscaleB;
+ __m128 r,rtab;
+ __m128 rB,rtabB;
+ __m128 eps,Y,F,G,H;
+ __m128 epsB,YB,FB,GB,HB;
+ __m128 vgb,vgbB,fijGB,fijGBB,dvdatmp,dvdatmpB;
+ __m128 facel,gbtabscale,mask,dvdaj;
+
+ __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
+ __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
+ __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
- __m128d iq,qq,jq,isai;
- __m128d ix,iy,iz;
- __m128d jx,jy,jz;
- __m128d dx,dy,dz;
- __m128d vctot,vgbtot,dvdasum,gbfactor;
- __m128d fix,fiy,fiz,tx,ty,tz,rsq;
- __m128d rinv,isaj,isaprod;
- __m128d vcoul,fscal,gbscale;
- __m128d rinvsq,r,rtab;
- __m128d eps,Y,F,G,H;
- __m128d vgb,fijGB,dvdatmp;
- __m128d facel,gbtabscale,dvdaj;
__m128i n0, nnn;
+ __m128i n0B, nnnB;
- const __m128d neg = _mm_set1_pd(-1.0);
- const __m128d zero = _mm_set1_pd(0.0);
- const __m128d minushalf = _mm_set1_pd(-0.5);
- const __m128d two = _mm_set1_pd(2.0);
+ const __m128 neg = {-1.0f,-1.0f,-1.0f,-1.0f};
+ const __m128 zero = {0.0f,0.0f,0.0f,0.0f};
+ const __m128 minushalf = {-0.5f,-0.5f,-0.5f,-0.5f};
+ const __m128 two = {2.0f,2.0f,2.0f,2.0f};
+
+ gbdata = (gmx_gbdata_t *)work;
+ gpol = gbdata->gpol;
- gbdata = (gmx_gbdata_t *)work;
- gpol = gbdata->gpol;
-
- nri = *p_nri;
+ nri = *p_nri;
- gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
- gbtabscale = _mm_load1_pd(p_gbtabscale);
- facel = _mm_load1_pd(p_facel);
+ gbfactor = _mm_set1_ps( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_ps(p_gbtabscale);
+ facel = _mm_load1_ps(p_facel);
+
+ nj1 = 0;
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j3A = j3B = j3C = j3D = 0;
+ jx = _mm_setzero_ps();
+ jy = _mm_setzero_ps();
+ jz = _mm_setzero_ps();
- nj1 = 0;
- jnrA = jnrB = 0;
- j3A = j3B = 0;
- jx = _mm_setzero_pd();
- jy = _mm_setzero_pd();
- jz = _mm_setzero_pd();
-
- for(n=0;n<nri;n++)
- {
+ for(n=0; (n<nri); n++)
+ {
is3 = 3*shift[n];
shX = shiftvec[is3];
shY = shiftvec[is3+1];
nj1 = jindex[n+1];
ii = iinr[n];
ii3 = 3*ii;
-
- ix = _mm_set1_pd(shX+pos[ii3+0]);
- iy = _mm_set1_pd(shY+pos[ii3+1]);
- iz = _mm_set1_pd(shZ+pos[ii3+2]);
-
- iq = _mm_load1_pd(charge+ii);
- iq = _mm_mul_pd(iq,facel);
- isai = _mm_load1_pd(invsqrta+ii);
-
- vctot = _mm_setzero_pd();
- vgbtot = _mm_setzero_pd();
- dvdasum = _mm_setzero_pd();
- fix = _mm_setzero_pd();
- fiy = _mm_setzero_pd();
- fiz = _mm_setzero_pd();
-
- for(k=nj0;k<nj1-1; k+=2)
+ ix = _mm_set1_ps(shX+pos[ii3+0]);
+ iy = _mm_set1_ps(shY+pos[ii3+1]);
+ iz = _mm_set1_ps(shZ+pos[ii3+2]);
+
+ iq = _mm_load1_ps(charge+ii);
+ iq = _mm_mul_ps(iq,facel);
+
+ isai = _mm_load1_ps(invsqrta+ii);
+
+ vctot = _mm_setzero_ps();
+ vgbtot = _mm_setzero_ps();
+ dvdasum = _mm_setzero_ps();
+ fix = _mm_setzero_ps();
+ fiy = _mm_setzero_ps();
+ fiz = _mm_setzero_ps();
+
+ for(k=nj0; k<nj1-7; k+=8)
{
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
+ jnrC = jjnr[k+2];
+ jnrD = jjnr[k+3];
+ jnrE = jjnr[k+4];
+ jnrF = jjnr[k+5];
+ jnrG = jjnr[k+6];
+ jnrH = jjnr[k+7];
+
+ j3A = 3*jnrA;
+ j3B = 3*jnrB;
+ j3C = 3*jnrC;
+ j3D = 3*jnrD;
+ j3E = 3*jnrE;
+ j3F = 3*jnrF;
+ j3G = 3*jnrG;
+ j3H = 3*jnrH;
+
+
+ GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
+ GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3E,pos+j3F,pos+j3G,pos+j3H,jxB,jyB,jzB);
+
+ dx = _mm_sub_ps(ix,jx);
+ dy = _mm_sub_ps(iy,jy);
+ dz = _mm_sub_ps(iz,jz);
+ dxB = _mm_sub_ps(ix,jxB);
+ dyB = _mm_sub_ps(iy,jyB);
+ dzB = _mm_sub_ps(iz,jzB);
+
+ rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
+ rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
- j3A = jnrA * 3;
- j3B = jnrB * 3;
+ /* invsqrt */
+ rinv = gmx_mm_invsqrt_ps(rsq);
+ rinvB = gmx_mm_invsqrt_ps(rsqB);
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
+ GMX_MM_LOAD_4VALUES_PS(charge+jnrE,charge+jnrF,charge+jnrG,charge+jnrH,jqB);
+ GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
+ GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrE,invsqrta+jnrF,invsqrta+jnrG,invsqrta+jnrH,isajB);
+
+ isaprod = _mm_mul_ps(isai,isaj);
+ isaprodB = _mm_mul_ps(isai,isajB);
+ qq = _mm_mul_ps(iq,jq);
+ qqB = _mm_mul_ps(iq,jqB);
+ vcoul = _mm_mul_ps(qq,rinv);
+ vcoulB = _mm_mul_ps(qqB,rinvB);
+ fscal = _mm_mul_ps(vcoul,rinv);
+ fscalB = _mm_mul_ps(vcoulB,rinvB);
+ vctot = _mm_add_ps(vctot,vcoul);
+ vctot = _mm_add_ps(vctot,vcoulB);
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+ /* Polarization interaction */
+ qq = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+ qqB = _mm_mul_ps(qqB,_mm_mul_ps(isaprodB,gbfactor));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ gbscaleB = _mm_mul_ps(isaprodB,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_ps(rsq,rinv);
+ rB = _mm_mul_ps(rsqB,rinvB);
+ rtab = _mm_mul_ps(r,gbscale);
+ rtabB = _mm_mul_ps(rB,gbscaleB);
+
+ n0 = _mm_cvttps_epi32(rtab);
+ n0B = _mm_cvttps_epi32(rtabB);
+ eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+ epsB = _mm_sub_ps(rtabB , _mm_cvtepi32_ps(n0B) );
+ nnn = _mm_slli_epi32(n0,2);
+ nnnB = _mm_slli_epi32(n0B,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_ps */
+ Y = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+ F = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+ G = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+ H = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ YB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,0)); /* YFGH */
+ FB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,1)); /* YFGH */
+ GB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,2)); /* YFGH */
+ HB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,3)); /* YFGH */
+ _MM_TRANSPOSE4_PS(YB,FB,GB,HB);
- dx = _mm_sub_pd(ix,jx);
- dy = _mm_sub_pd(iy,jy);
- dz = _mm_sub_pd(iz,jz);
+ G = _mm_mul_ps(G,eps);
+ H = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+ F = _mm_add_ps(F, _mm_add_ps( G , H ) );
+ Y = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+ F = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+ vgb = _mm_mul_ps(Y, qq);
+ fijGB = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
+
+ GB = _mm_mul_ps(GB,epsB);
+ HB = _mm_mul_ps(HB, _mm_mul_ps(epsB,epsB) );
+ FB = _mm_add_ps(FB, _mm_add_ps( GB , HB ) );
+ YB = _mm_add_ps(YB, _mm_mul_ps(FB, epsB));
+ FB = _mm_add_ps(FB, _mm_add_ps(GB , _mm_mul_ps(HB,two)));
+ vgbB = _mm_mul_ps(YB, qqB);
+ fijGBB = _mm_mul_ps(FB, _mm_mul_ps(qqB,gbscaleB));
+
+ dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
+ dvdatmpB = _mm_mul_ps(_mm_add_ps(vgbB, _mm_mul_ps(fijGBB,rB)) , minushalf);
+
+ vgbtot = _mm_add_ps(vgbtot, vgb);
+ vgbtot = _mm_add_ps(vgbtot, vgbB);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+ dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+ dvdasum = _mm_add_ps(dvdasum, dvdatmpB);
+ dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
+ dvdatmpB = _mm_mul_ps(dvdatmpB, _mm_mul_ps(isajB,isajB));
- rinv = gmx_mm_invsqrt_pd(rsq);
- rinvsq = _mm_mul_pd(rinv,rinv);
+ GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
+ GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrE,dvda+jnrF,dvda+jnrG,dvda+jnrH,dvdatmpB);
+
+ fscal = _mm_mul_ps( _mm_sub_ps( fscal, fijGB),rinv );
+ fscalB = _mm_mul_ps( _mm_sub_ps( fscalB, fijGBB),rinvB );
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx);
+ ty = _mm_mul_ps(fscal,dy);
+ tz = _mm_mul_ps(fscal,dz);
+ txB = _mm_mul_ps(fscalB,dxB);
+ tyB = _mm_mul_ps(fscalB,dyB);
+ tzB = _mm_mul_ps(fscalB,dzB);
+
+ /* Increment i atom force */
+ fix = _mm_add_ps(fix,tx);
+ fiy = _mm_add_ps(fiy,ty);
+ fiz = _mm_add_ps(fiz,tz);
+ fix = _mm_add_ps(fix,txB);
+ fiy = _mm_add_ps(fiy,tyB);
+ fiz = _mm_add_ps(fiz,tzB);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
+ GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3E,faction+j3F,faction+j3G,faction+j3H,txB,tyB,tzB);
+ }
+ for(;k<nj1-3; k+=4)
+ {
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
+ jnrC = jjnr[k+2];
+ jnrD = jjnr[k+3];
+
+ j3A = 3*jnrA;
+ j3B = 3*jnrB;
+ j3C = 3*jnrC;
+ j3D = 3*jnrD;
+
+ GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
+
+ dx = _mm_sub_ps(ix,jx);
+ dy = _mm_sub_ps(iy,jy);
+ dz = _mm_sub_ps(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
+
+ /* invsqrt */
+ rinv = gmx_mm_invsqrt_ps(rsq);
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
- GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
- GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
-
- isaprod = _mm_mul_pd(isai,isaj);
- qq = _mm_mul_pd(iq,jq);
- vcoul = _mm_mul_pd(qq,rinv);
- fscal = _mm_mul_pd(vcoul,rinv);
- vctot = _mm_add_pd(vctot,vcoul);
+ GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
+ GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
+
+ isaprod = _mm_mul_ps(isai,isaj);
+ qq = _mm_mul_ps(iq,jq);
+ vcoul = _mm_mul_ps(qq,rinv);
+ fscal = _mm_mul_ps(vcoul,rinv);
+ vctot = _mm_add_ps(vctot,vcoul);
/* Polarization interaction */
- qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
- gbscale = _mm_mul_pd(isaprod,gbtabscale);
+ qq = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
/* Calculate GB table index */
- r = _mm_mul_pd(rsq,rinv);
- rtab = _mm_mul_pd(r,gbscale);
-
- n0 = _mm_cvttpd_epi32(rtab);
- eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
- nnn = _mm_slli_epi32(n0,2);
-
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
- F = _mm_add_pd(F, _mm_add_pd( G , H ) );
- Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
- F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
- vgb = _mm_mul_pd(Y, qq);
- fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-
- dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+ r = _mm_mul_ps(rsq,rinv);
+ rtab = _mm_mul_ps(r,gbscale);
- vgbtot = _mm_add_pd(vgbtot, vgb);
+ n0 = _mm_cvttps_epi32(rtab);
+ eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_ps */
+ Y = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+ F = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+ G = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+ H = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
- dvdasum = _mm_add_pd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+ G = _mm_mul_ps(G,eps);
+ H = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+ F = _mm_add_ps(F, _mm_add_ps( G , H ) );
+ Y = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+ F = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+ vgb = _mm_mul_ps(Y, qq);
+ fijGB = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
+
+ dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
- GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
-
- fscal = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
+ vgbtot = _mm_add_ps(vgbtot, vgb);
+
+ dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
+
+ GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
+
+ fscal = _mm_mul_ps( _mm_sub_ps( fscal, fijGB),rinv );
/***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
/* Calculate temporary vectorial force */
- tx = _mm_mul_pd(fscal,dx);
- ty = _mm_mul_pd(fscal,dy);
- tz = _mm_mul_pd(fscal,dz);
+ tx = _mm_mul_ps(fscal,dx);
+ ty = _mm_mul_ps(fscal,dy);
+ tz = _mm_mul_ps(fscal,dz);
/* Increment i atom force */
- fix = _mm_add_pd(fix,tx);
- fiy = _mm_add_pd(fiy,ty);
- fiz = _mm_add_pd(fiz,tz);
+ fix = _mm_add_ps(fix,tx);
+ fiy = _mm_add_ps(fiy,ty);
+ fiz = _mm_add_ps(fiz,tz);
/* Store j forces back */
- GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
- }
-
- /* In double precision, offset can only be either 0 or 1 */
- if(k<nj1)
- {
- jnrA = jjnr[k];
- j3A = jnrA * 3;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-
- dx = _mm_sub_sd(ix,jx);
- dy = _mm_sub_sd(iy,jy);
- dz = _mm_sub_sd(iz,jz);
-
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+ GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
+ }
+
+ offset = (nj1-nj0)%4;
+
+ if(offset!=0)
+ {
+ /* Epilogue loop to take care of non-4-multiple j particles */
+ if(offset==1)
+ {
+ jnrA = jjnr[k];
+ j3A = 3*jnrA;
+ GMX_MM_LOAD_1RVEC_1POINTER_PS(pos+j3A,jx,jy,jz);
+ GMX_MM_LOAD_1VALUE_PS(charge+jnrA,jq);
+ GMX_MM_LOAD_1VALUE_PS(invsqrta+jnrA,isaj);
+ mask = mask1;
+ }
+ else if(offset==2)
+ {
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
+ j3A = 3*jnrA;
+ j3B = 3*jnrB;
+ GMX_MM_LOAD_1RVEC_2POINTERS_PS(pos+j3A,pos+j3B,jx,jy,jz);
+ GMX_MM_LOAD_2VALUES_PS(charge+jnrA,charge+jnrB,jq);
+ GMX_MM_LOAD_2VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,isaj);
+ mask = mask2;
+ }
+ else
+ {
+ /* offset must be 3 */
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
+ jnrC = jjnr[k+2];
+ j3A = 3*jnrA;
+ j3B = 3*jnrB;
+ j3C = 3*jnrC;
+ GMX_MM_LOAD_1RVEC_3POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,jx,jy,jz);
+ GMX_MM_LOAD_3VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,jq);
+ GMX_MM_LOAD_3VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,isaj);
+ mask = mask3;
+ }
- rinv = gmx_mm_invsqrt_pd(rsq);
- rinvsq = _mm_mul_sd(rinv,rinv);
+ dx = _mm_sub_ps(ix,jx);
+ dy = _mm_sub_ps(iy,jy);
+ dz = _mm_sub_ps(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
+
+ /* invsqrt */
+ rinv = gmx_mm_invsqrt_ps(rsq);
+ rinv = _mm_and_ps(rinv,mask);
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
- GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
- GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-
- isaprod = _mm_mul_sd(isai,isaj);
- qq = _mm_mul_sd(iq,jq);
- vcoul = _mm_mul_sd(qq,rinv);
- fscal = _mm_mul_sd(vcoul,rinv);
- vctot = _mm_add_sd(vctot,vcoul);
+ isaprod = _mm_mul_ps(isai,isaj);
+ qq = _mm_and_ps(_mm_mul_ps(iq,jq),mask);
+ vcoul = _mm_mul_ps(qq,rinv);
+ fscal = _mm_mul_ps(vcoul,rinv);
+ vctot = _mm_add_ps(vctot,vcoul);
/* Polarization interaction */
- qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
- gbscale = _mm_mul_sd(isaprod,gbtabscale);
+ qq = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
/* Calculate GB table index */
- r = _mm_mul_sd(rsq,rinv);
- rtab = _mm_mul_sd(r,gbscale);
+ r = _mm_mul_ps(rsq,rinv);
+ rtab = _mm_mul_ps(r,gbscale);
- n0 = _mm_cvttpd_epi32(rtab);
- eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
- nnn = _mm_slli_epi32(n0,2);
+ n0 = _mm_cvttps_epi32(rtab);
+ eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+ nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
- F = _mm_add_sd(F, _mm_add_sd( G , H ) );
- Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
- F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
- vgb = _mm_mul_sd(Y, qq);
- fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+ /* the tables are 16-byte aligned, so we can use _mm_load_ps */
+ Y = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+ F = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+ G = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+ H = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+
+ G = _mm_mul_ps(G,eps);
+ H = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+ F = _mm_add_ps(F, _mm_add_ps( G , H ) );
+ Y = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+ F = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+ vgb = _mm_mul_ps(Y, qq);
+ fijGB = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
- dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+ dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
+ vgbtot = _mm_add_ps(vgbtot, vgb);
- vgbtot = _mm_add_sd(vgbtot, vgb);
+ dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
- dvdasum = _mm_add_sd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-
- GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
-
- fscal = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
+ fscal = _mm_mul_ps( _mm_sub_ps( fscal, fijGB),rinv );
/***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
/* Calculate temporary vectorial force */
- tx = _mm_mul_sd(fscal,dx);
- ty = _mm_mul_sd(fscal,dy);
- tz = _mm_mul_sd(fscal,dz);
+ tx = _mm_mul_ps(fscal,dx);
+ ty = _mm_mul_ps(fscal,dy);
+ tz = _mm_mul_ps(fscal,dz);
/* Increment i atom force */
- fix = _mm_add_sd(fix,tx);
- fiy = _mm_add_sd(fiy,ty);
- fiz = _mm_add_sd(fiz,tz);
+ fix = _mm_add_ps(fix,tx);
+ fiy = _mm_add_ps(fiy,ty);
+ fiz = _mm_add_ps(fiz,tz);
- /* Store j forces back */
- GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
- }
+ if(offset==1)
+ {
+ GMX_MM_INCREMENT_1VALUE_PS(dvda+jnrA,dvdatmp);
+ GMX_MM_DECREMENT_1RVEC_1POINTER_PS(faction+j3A,tx,ty,tz);
+ }
+ else if(offset==2)
+ {
+ GMX_MM_INCREMENT_2VALUES_PS(dvda+jnrA,dvda+jnrB,dvdatmp);
+ GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(faction+j3A,faction+j3B,tx,ty,tz);
+ }
+ else
+ {
+ /* offset must be 3 */
+ GMX_MM_INCREMENT_3VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvdatmp);
+ GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,tx,ty,tz);
+ }
+ }
+
+ dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai,isai));
+ gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
- dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
- gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+ ggid = gid[n];
+
+ GMX_MM_UPDATE_1POT_PS(vctot,vc+ggid);
+ GMX_MM_UPDATE_1POT_PS(vgbtot,gpol+ggid);
+ GMX_MM_UPDATE_1POT_PS(dvdasum,dvda+ii);
+ }
+
+ *outeriter = nri;
+ *inneriter = nj1;
+}
+
+
+
+/*
+ * Gromacs nonbonded kernel nb_kernel400nf
+ * Coulomb interaction: Generalized-Born
+ * VdW interaction: Not calculated
+ * water optimization: No
+ * Calculate forces: no
+ */
+void nb_kernel400nf_x86_64_sse(
+ int * p_nri,
+ int * iinr,
+ int * jindex,
+ int * jjnr,
+ int * shift,
+ float * shiftvec,
+ float * fshift,
+ int * gid,
+ float * pos,
+ float * faction,
+ float * charge,
+ float * p_facel,
+ float * p_krf,
+ float * p_crf,
+ float * Vc,
+ int * type,
+ int * p_ntype,
+ float * vdwparam,
+ float * Vvdw,
+ float * p_tabscale,
+ float * VFtab,
+ float * invsqrta,
+ float * dvda,
+ float * p_gbtabscale,
+ float * GBtab,
+ int * p_nthreads,
+ int * count,
+ void * mtx,
+ int * outeriter,
+ int * inneriter,
+ float * work)
+{
+ int nri,ntype,nthreads;
+ float facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
+ int n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
+ float shX,shY,shZ;
+ float iq;
+ float qq,vcoul,vctot;
+ float r,rt,eps,eps2;
+ int n0,nnn;
+ float Y,F,Geps,Heps2,Fp,VV;
+ float isai,isaj,isaprod,gbscale;
+ float ix1,iy1,iz1;
+ float jx1,jy1,jz1;
+ float dx11,dy11,dz11,rsq11,rinv11;
+ const int fractshift = 12;
+ const int fractmask = 8388607;
+ const int expshift = 23;
+ const int expmask = 2139095040;
+ const int explsb = 8388608;
+ float lu;
+ int iexp,addr;
+ union { unsigned int bval; float fval; } bitpattern,result;
+
+ nri = *p_nri;
+ ntype = *p_ntype;
+ nthreads = *p_nthreads;
+ facel = *p_facel;
+ krf = *p_krf;
+ crf = *p_crf;
+ tabscale = *p_tabscale;
+ gbtabscale = *p_gbtabscale;
+ nj1 = 0;
+
+ for(n=0; (n<nri); n++)
+ {
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
+ ix1 = shX + pos[ii3+0];
+ iy1 = shY + pos[ii3+1];
+ iz1 = shZ + pos[ii3+2];
+ iq = facel*charge[ii];
+ isai = invsqrta[ii];
+ vctot = 0;
- ggid = gid[n];
+ for(k=nj0; (k<nj1); k++)
+ {
+ jnr = jjnr[k];
+ j3 = 3*jnr;
+ jx1 = pos[j3+0];
+ jy1 = pos[j3+1];
+ jz1 = pos[j3+2];
+ dx11 = ix1 - jx1;
+ dy11 = iy1 - jy1;
+ dz11 = iz1 - jz1;
+ rsq11 = dx11*dx11+dy11*dy11+dz11*dz11;
+ bitpattern.fval = rsq11;
+ iexp = (((bitpattern.bval)&expmask)>>expshift);
+ addr = (((bitpattern.bval)&(fractmask|explsb))>>fractshift);
+ result.bval = gmx_invsqrt_exptab[iexp] | gmx_invsqrt_fracttab[addr];
+ lu = result.fval;
+ rinv11 = (0.5*lu*(3.0-((rsq11*lu)*lu)));
+ isaj = invsqrta[jnr];
+ isaprod = isai*isaj;
+ qq = iq*charge[jnr];
+ vcoul = qq*rinv11;
+ qq = isaprod*(-qq);
+ gbscale = isaprod*gbtabscale;
+ r = rsq11*rinv11;
+ rt = r*gbscale;
+ n0 = rt;
+ eps = rt-n0;
+ eps2 = eps*eps;
+ nnn = 4*n0;
+ Y = GBtab[nnn];
+ F = GBtab[nnn+1];
+ Geps = eps*GBtab[nnn+2];
+ Heps2 = eps2*GBtab[nnn+3];
+ Fp = F+Geps+Heps2;
+ VV = Y+eps*Fp;
+ vgb = qq*VV;
+ vctot = vctot + vcoul;
+ }
- gmx_mm_update_1pot_pd(vctot,vc+ggid);
- gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
- }
+ ggid = gid[n];
+ Vc[ggid] = Vc[ggid] + vctot;
+ }
- *outeriter = nri;
- *inneriter = nj1;
+ *outeriter = nri;
+ *inneriter = nj1;
}
+
+
*
* Options used when generating this file:
* Language: c
- * Precision: double
+ * Precision: single
* Threads: no
* Software invsqrt: yes
* Prefetch forces: no
#include <xmmintrin.h>
#include <emmintrin.h>
-#include <gmx_sse2_double.h>
+#include <gmx_sse2_single.h>
/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
-#include "nb_kernel410_ia32_sse2.h"
+#include "nb_kernel410_ia32_sse.h"
-void nb_kernel410_ia32_sse2(int * p_nri,
- int * iinr,
- int * jindex,
- int * jjnr,
- int * shift,
- double * shiftvec,
- double * fshift,
- int * gid,
- double * pos,
- double * faction,
- double * charge,
- double * p_facel,
- double * p_krf,
- double * p_crf,
- double * vc,
- int * type,
- int * p_ntype,
- double * vdwparam,
- double * vvdw,
- double * p_tabscale,
- double * VFtab,
- double * invsqrta,
- double * dvda,
- double * p_gbtabscale,
- double * GBtab,
- int * p_nthreads,
- int * count,
- void * mtx,
- int * outeriter,
- int * inneriter,
- double * work)
+void nb_kernel410_ia32_sse(int * p_nri,
+ int * iinr,
+ int * jindex,
+ int * jjnr,
+ int * shift,
+ float * shiftvec,
+ float * fshift,
+ int * gid,
+ float * pos,
+ float * faction,
+ float * charge,
+ float * p_facel,
+ float * p_krf,
+ float * p_crf,
+ float * vc,
+ int * type,
+ int * p_ntype,
+ float * vdwparam,
+ float * vvdw,
+ float * p_tabscale,
+ float * VFtab,
+ float * invsqrta,
+ float * dvda,
+ float * p_gbtabscale,
+ float * GBtab,
+ int * p_nthreads,
+ int * count,
+ void * mtx,
+ int * outeriter,
+ int * inneriter,
+ float * work)
{
int nri,ntype,nthreads;
int n,ii,is3,ii3,k,nj0,nj1,ggid;
- double shX,shY,shZ;
+ float shX,shY,shZ;
int offset,nti;
- int jnrA,jnrB;
- int j3A,j3B;
- int tjA,tjB;
+ int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
+ int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
+ int tjA,tjB,tjC,tjD;
+ int tjE,tjF,tjG,tjH;
gmx_gbdata_t *gbdata;
- double * gpol;
+ float * gpol;
+
+ __m128 iq,qq,qqB,jq,jqB,isai;
+ __m128 ix,iy,iz;
+ __m128 jx,jy,jz;
+ __m128 jxB,jyB,jzB;
+ __m128 dx,dy,dz;
+ __m128 dxB,dyB,dzB;
+ __m128 vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
+ __m128 fix,fiy,fiz,tx,ty,tz,rsq;
+ __m128 fixB,fiyB,fizB,txB,tyB,tzB,rsqB;
+ __m128 rinv,isaj,isaprod;
+ __m128 rinvB,isajB,isaprodB;
+ __m128 vcoul,vcoulB,fscal,fscalB,gbscale,gbscaleB,c6,c6B,c12,c12B;
+ __m128 rinvsq,r,rtab;
+ __m128 rinvsqB,rB,rtabB;
+ __m128 eps,Y,F,G,H;
+ __m128 epsB,YB,FB,GB,HB;
+ __m128 vgb,vgbB,fijGB,fijGBB,dvdatmp,dvdatmpB;
+ __m128 rinvsix,vvdw6,vvdw12;
+ __m128 rinvsixB,vvdw6B,vvdw12B;
+ __m128 facel,gbtabscale,mask,dvdaj;
+
+ __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
+ __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
+ __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
- __m128d iq,qq,jq,isai;
- __m128d ix,iy,iz;
- __m128d jx,jy,jz;
- __m128d dx,dy,dz;
- __m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
- __m128d fix,fiy,fiz,tx,ty,tz,rsq;
- __m128d rinv,isaj,isaprod;
- __m128d vcoul,fscal,gbscale,c6,c12;
- __m128d rinvsq,r,rtab;
- __m128d eps,Y,F,G,H;
- __m128d vgb,fijGB,dvdatmp;
- __m128d rinvsix,vvdw6,vvdw12;
- __m128d facel,gbtabscale,dvdaj;
__m128i n0, nnn;
+ __m128i n0B, nnnB;
- const __m128d neg = _mm_set1_pd(-1.0);
- const __m128d zero = _mm_set1_pd(0.0);
- const __m128d minushalf = _mm_set1_pd(-0.5);
- const __m128d two = _mm_set1_pd(2.0);
- const __m128d six = _mm_set1_pd(6.0);
- const __m128d twelve = _mm_set1_pd(12.0);
-
- gbdata = (gmx_gbdata_t *)work;
- gpol = gbdata->gpol;
+ const __m128 neg = _mm_set1_ps(-1.0f);
+ const __m128 zero = _mm_set1_ps(0.0f);
+ const __m128 minushalf = _mm_set1_ps(-0.5f);
+ const __m128 two = _mm_set1_ps(2.0f);
+ const __m128 six = _mm_set1_ps(6.0f);
+ const __m128 twelve = _mm_set1_ps(12.0f);
- nri = *p_nri;
- ntype = *p_ntype;
+ gbdata = (gmx_gbdata_t *)work;
+ gpol = gbdata->gpol;
+
+ nri = *p_nri;
+ ntype = *p_ntype;
- gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
- gbtabscale = _mm_load1_pd(p_gbtabscale);
- facel = _mm_load1_pd(p_facel);
+ gbfactor = _mm_set1_ps( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_ps(p_gbtabscale);
+ facel = _mm_load1_ps(p_facel);
- nj1 = 0;
- jnrA = jnrB = 0;
- j3A = j3B = 0;
- jx = _mm_setzero_pd();
- jy = _mm_setzero_pd();
- jz = _mm_setzero_pd();
- c6 = _mm_setzero_pd();
- c12 = _mm_setzero_pd();
-
- for(n=0;n<nri;n++)
- {
+ nj1 = 0;
+ jnrA = jnrB = jnrC = jnrD = 0;
+ j3A = j3B = j3C = j3D = 0;
+ jx = _mm_setzero_ps();
+ jy = _mm_setzero_ps();
+ jz = _mm_setzero_ps();
+ c6 = _mm_setzero_ps();
+ c12 = _mm_setzero_ps();
+
+
+ for(n=0; (n<nri); n++)
+ {
is3 = 3*shift[n];
shX = shiftvec[is3];
shY = shiftvec[is3+1];
nj1 = jindex[n+1];
ii = iinr[n];
ii3 = 3*ii;
-
- ix = _mm_set1_pd(shX+pos[ii3+0]);
- iy = _mm_set1_pd(shY+pos[ii3+1]);
- iz = _mm_set1_pd(shZ+pos[ii3+2]);
-
- iq = _mm_load1_pd(charge+ii);
- iq = _mm_mul_pd(iq,facel);
-
- isai = _mm_load1_pd(invsqrta+ii);
+ ix = _mm_set1_ps(shX+pos[ii3+0]);
+ iy = _mm_set1_ps(shY+pos[ii3+1]);
+ iz = _mm_set1_ps(shZ+pos[ii3+2]);
+
+ iq = _mm_load1_ps(charge+ii);
+ iq = _mm_mul_ps(iq,facel);
+
+ isai = _mm_load1_ps(invsqrta+ii);
+
nti = 2*ntype*type[ii];
- vctot = _mm_setzero_pd();
- vvdwtot = _mm_setzero_pd();
- vgbtot = _mm_setzero_pd();
- dvdasum = _mm_setzero_pd();
- fix = _mm_setzero_pd();
- fiy = _mm_setzero_pd();
- fiz = _mm_setzero_pd();
-
- for(k=nj0;k<nj1-1; k+=2)
+ vctot = _mm_setzero_ps();
+ vvdwtot = _mm_setzero_ps();
+ vgbtot = _mm_setzero_ps();
+ dvdasum = _mm_setzero_ps();
+ fix = _mm_setzero_ps();
+ fiy = _mm_setzero_ps();
+ fiz = _mm_setzero_ps();
+
+ for(k=nj0; k<nj1-7; k+=8)
{
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
-
- j3A = jnrA * 3;
- j3B = jnrB * 3;
-
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
+ jnrC = jjnr[k+2];
+ jnrD = jjnr[k+3];
+ jnrE = jjnr[k+4];
+ jnrF = jjnr[k+5];
+ jnrG = jjnr[k+6];
+ jnrH = jjnr[k+7];
+
+ j3A = 3*jnrA;
+ j3B = 3*jnrB;
+ j3C = 3*jnrC;
+ j3D = 3*jnrD;
+ j3E = 3*jnrE;
+ j3F = 3*jnrF;
+ j3G = 3*jnrG;
+ j3H = 3*jnrH;
+
- dx = _mm_sub_pd(ix,jx);
- dy = _mm_sub_pd(iy,jy);
- dz = _mm_sub_pd(iz,jz);
+ GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
+ GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3E,pos+j3F,pos+j3G,pos+j3H,jxB,jyB,jzB);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+ dx = _mm_sub_ps(ix,jx);
+ dy = _mm_sub_ps(iy,jy);
+ dz = _mm_sub_ps(iz,jz);
+ dxB = _mm_sub_ps(ix,jxB);
+ dyB = _mm_sub_ps(iy,jyB);
+ dzB = _mm_sub_ps(iz,jzB);
+
+ rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
+ rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
+
+ /* invsqrt */
+ rinv = gmx_mm_invsqrt_ps(rsq);
+ rinvB = gmx_mm_invsqrt_ps(rsqB);
+ rinvsq = _mm_mul_ps(rinv,rinv);
+ rinvsqB = _mm_mul_ps(rinvB,rinvB);
- rinv = gmx_mm_invsqrt_pd(rsq);
- rinvsq = _mm_mul_pd(rinv,rinv);
-
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
- GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
- GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
-
+ GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
+ GMX_MM_LOAD_4VALUES_PS(charge+jnrE,charge+jnrF,charge+jnrG,charge+jnrH,jqB);
+ GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
+ GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrE,invsqrta+jnrF,invsqrta+jnrG,invsqrta+jnrH,isajB);
+
/* Lennard-Jones */
tjA = nti+2*type[jnrA];
tjB = nti+2*type[jnrB];
-
- GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+ tjC = nti+2*type[jnrC];
+ tjD = nti+2*type[jnrD];
+ tjE = nti+2*type[jnrE];
+ tjF = nti+2*type[jnrF];
+ tjG = nti+2*type[jnrG];
+ tjH = nti+2*type[jnrH];
+
+ GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,vdwparam+tjD,c6,c12);
+ GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjE,vdwparam+tjF,vdwparam+tjG,vdwparam+tjH,c6B,c12B);
- isaprod = _mm_mul_pd(isai,isaj);
- qq = _mm_mul_pd(iq,jq);
- vcoul = _mm_mul_pd(qq,rinv);
- fscal = _mm_mul_pd(vcoul,rinv);
- vctot = _mm_add_pd(vctot,vcoul);
+ isaprod = _mm_mul_ps(isai,isaj);
+ isaprodB = _mm_mul_ps(isai,isajB);
+ qq = _mm_mul_ps(iq,jq);
+ qqB = _mm_mul_ps(iq,jqB);
+ vcoul = _mm_mul_ps(qq,rinv);
+ vcoulB = _mm_mul_ps(qqB,rinvB);
+ fscal = _mm_mul_ps(vcoul,rinv);
+ fscalB = _mm_mul_ps(vcoulB,rinvB);
+ vctot = _mm_add_ps(vctot,vcoul);
+ vctot = _mm_add_ps(vctot,vcoulB);
/* Polarization interaction */
- qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
- gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
+ qq = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+ qqB = _mm_mul_ps(qqB,_mm_mul_ps(isaprodB,gbfactor));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+ gbscaleB = _mm_mul_ps(isaprodB,gbtabscale);
+
/* Calculate GB table index */
- r = _mm_mul_pd(rsq,rinv);
- rtab = _mm_mul_pd(r,gbscale);
+ r = _mm_mul_ps(rsq,rinv);
+ rB = _mm_mul_ps(rsqB,rinvB);
+ rtab = _mm_mul_ps(r,gbscale);
+ rtabB = _mm_mul_ps(rB,gbscaleB);
- n0 = _mm_cvttpd_epi32(rtab);
- eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
- nnn = _mm_slli_epi32(n0,2);
+ n0 = _mm_cvttps_epi32(rtab);
+ n0B = _mm_cvttps_epi32(rtabB);
+ eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+ epsB = _mm_sub_ps(rtabB , _mm_cvtepi32_ps(n0B) );
+ nnn = _mm_slli_epi32(n0,2);
+ nnnB = _mm_slli_epi32(n0B,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
- F = _mm_add_pd(F, _mm_add_pd( G , H ) );
- Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
- F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
- vgb = _mm_mul_pd(Y, qq);
- fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-
- dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_pd(vgbtot, vgb);
-
- dvdasum = _mm_add_pd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-
- GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+ /* the tables are 16-byte aligned, so we can use _mm_load_ps */
+ Y = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+ F = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+ G = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+ H = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+ YB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,0)); /* YFGH */
+ FB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,1)); /* YFGH */
+ GB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,2)); /* YFGH */
+ HB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,3)); /* YFGH */
+ _MM_TRANSPOSE4_PS(YB,FB,GB,HB);
+
+ G = _mm_mul_ps(G,eps);
+ H = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+ F = _mm_add_ps(F, _mm_add_ps( G , H ) );
+ Y = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+ F = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+ vgb = _mm_mul_ps(Y, qq);
+ fijGB = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
+
+ GB = _mm_mul_ps(GB,epsB);
+ HB = _mm_mul_ps(HB, _mm_mul_ps(epsB,epsB) );
+ FB = _mm_add_ps(FB, _mm_add_ps( GB , HB ) );
+ YB = _mm_add_ps(YB, _mm_mul_ps(FB, epsB));
+ FB = _mm_add_ps(FB, _mm_add_ps(GB , _mm_mul_ps(HB,two)));
+ vgbB = _mm_mul_ps(YB, qqB);
+ fijGBB = _mm_mul_ps(FB, _mm_mul_ps(qqB,gbscaleB));
+
+
+ dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
+ dvdatmpB = _mm_mul_ps(_mm_add_ps(vgbB, _mm_mul_ps(fijGBB,rB)) , minushalf);
+
+ vgbtot = _mm_add_ps(vgbtot, vgb);
+ vgbtot = _mm_add_ps(vgbtot, vgbB);
+
+ dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+ dvdasum = _mm_add_ps(dvdasum, dvdatmpB);
+ dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
+ dvdatmpB = _mm_mul_ps(dvdatmpB, _mm_mul_ps(isajB,isajB));
+
+ GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
+ GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrE,dvda+jnrF,dvda+jnrG,dvda+jnrH,dvdatmpB);
- rinvsix = _mm_mul_pd(rinvsq,rinvsq);
- rinvsix = _mm_mul_pd(rinvsix,rinvsq);
+ rinvsix = _mm_mul_ps(rinvsq,rinvsq);
+ rinvsixB = _mm_mul_ps(rinvsqB,rinvsqB);
+ rinvsix = _mm_mul_ps(rinvsix,rinvsq);
+ rinvsixB = _mm_mul_ps(rinvsixB,rinvsqB);
- vvdw6 = _mm_mul_pd(c6,rinvsix);
- vvdw12 = _mm_mul_pd(c12, _mm_mul_pd(rinvsix,rinvsix));
- vvdwtot = _mm_add_pd(vvdwtot,_mm_sub_pd(vvdw12,vvdw6));
-
- fscal = _mm_sub_pd(_mm_mul_pd(rinvsq,
- _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
- _mm_mul_pd(six,vvdw6))),
- _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
-
+ vvdw6 = _mm_mul_ps(c6,rinvsix);
+ vvdw6B = _mm_mul_ps(c6B,rinvsixB);
+ vvdw12 = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
+ vvdw12B = _mm_mul_ps(c12B, _mm_mul_ps(rinvsixB,rinvsixB));
+ vvdwtot = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
+ vvdwtot = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12B,vvdw6B));
+
+ fscal = _mm_sub_ps(_mm_mul_ps(rinvsq,
+ _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
+ _mm_mul_ps(six,vvdw6))),
+ _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
+
+ fscalB = _mm_sub_ps(_mm_mul_ps(rinvsqB,
+ _mm_sub_ps(_mm_mul_ps(twelve,vvdw12B),
+ _mm_mul_ps(six,vvdw6B))),
+ _mm_mul_ps( _mm_sub_ps( fijGBB,fscalB),rinvB ));
+
/***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
-
+
/* Calculate temporary vectorial force */
- tx = _mm_mul_pd(fscal,dx);
- ty = _mm_mul_pd(fscal,dy);
- tz = _mm_mul_pd(fscal,dz);
+ tx = _mm_mul_ps(fscal,dx);
+ ty = _mm_mul_ps(fscal,dy);
+ tz = _mm_mul_ps(fscal,dz);
+ txB = _mm_mul_ps(fscalB,dxB);
+ tyB = _mm_mul_ps(fscalB,dyB);
+ tzB = _mm_mul_ps(fscalB,dzB);
/* Increment i atom force */
- fix = _mm_add_pd(fix,tx);
- fiy = _mm_add_pd(fiy,ty);
- fiz = _mm_add_pd(fiz,tz);
+ fix = _mm_add_ps(fix,tx);
+ fiy = _mm_add_ps(fiy,ty);
+ fiz = _mm_add_ps(fiz,tz);
+ fix = _mm_add_ps(fix,txB);
+ fiy = _mm_add_ps(fiy,tyB);
+ fiz = _mm_add_ps(fiz,tzB);
/* Store j forces back */
- GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
- }
-
- /* In double precision, offset can only be either 0 or 1 */
- if(k<nj1)
+ GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
+ GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3E,faction+j3F,faction+j3G,faction+j3H,txB,tyB,tzB);
+ }
+ for(;k<nj1-3; k+=4)
{
- jnrA = jjnr[k];
-
- j3A = jnrA * 3;
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
+ jnrC = jjnr[k+2];
+ jnrD = jjnr[k+3];
- GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+ j3A = 3*jnrA;
+ j3B = 3*jnrB;
+ j3C = 3*jnrC;
+ j3D = 3*jnrD;
- dx = _mm_sub_sd(ix,jx);
- dy = _mm_sub_sd(iy,jy);
- dz = _mm_sub_sd(iz,jz);
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+ GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
- rinv = gmx_mm_invsqrt_pd(rsq);
- rinvsq = _mm_mul_sd(rinv,rinv);
+ dx = _mm_sub_ps(ix,jx);
+ dy = _mm_sub_ps(iy,jy);
+ dz = _mm_sub_ps(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
+
+ /* invsqrt */
+ rinv = gmx_mm_invsqrt_ps(rsq);
+ rinvsq = _mm_mul_ps(rinv,rinv);
/***********************************/
/* INTERACTION SECTION STARTS HERE */
/***********************************/
- GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
- GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+ GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
+ GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
/* Lennard-Jones */
tjA = nti+2*type[jnrA];
+ tjB = nti+2*type[jnrB];
+ tjC = nti+2*type[jnrC];
+ tjD = nti+2*type[jnrD];
- GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+ GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,vdwparam+tjD,c6,c12);
- isaprod = _mm_mul_sd(isai,isaj);
- qq = _mm_mul_sd(iq,jq);
- vcoul = _mm_mul_sd(qq,rinv);
- fscal = _mm_mul_sd(vcoul,rinv);
- vctot = _mm_add_sd(vctot,vcoul);
+ isaprod = _mm_mul_ps(isai,isaj);
+ qq = _mm_mul_ps(iq,jq);
+ vcoul = _mm_mul_ps(qq,rinv);
+ fscal = _mm_mul_ps(vcoul,rinv);
+ vctot = _mm_add_ps(vctot,vcoul);
/* Polarization interaction */
- qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
- gbscale = _mm_mul_sd(isaprod,gbtabscale);
+ qq = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
/* Calculate GB table index */
- r = _mm_mul_sd(rsq,rinv);
- rtab = _mm_mul_sd(r,gbscale);
+ r = _mm_mul_ps(rsq,rinv);
+ rtab = _mm_mul_ps(r,gbscale);
- n0 = _mm_cvttpd_epi32(rtab);
- eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
- nnn = _mm_slli_epi32(n0,2);
+ n0 = _mm_cvttps_epi32(rtab);
+ eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+ nnn = _mm_slli_epi32(n0,2);
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
- F = _mm_add_sd(F, _mm_add_sd( G , H ) );
- Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
- F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
- vgb = _mm_mul_sd(Y, qq);
- fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-
- dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_sd(vgbtot, vgb);
-
- dvdasum = _mm_add_sd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-
- GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+ /* the tables are 16-byte aligned, so we can use _mm_load_ps */
+ Y = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+ F = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+ G = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+ H = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+
+ G = _mm_mul_ps(G,eps);
+ H = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+ F = _mm_add_ps(F, _mm_add_ps( G , H ) );
+ Y = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+ F = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+ vgb = _mm_mul_ps(Y, qq);
+ fijGB = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
+
+
+ dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_ps(vgbtot, vgb);
+
+ dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
+
+ GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
+
- rinvsix = _mm_mul_sd(rinvsq,rinvsq);
- rinvsix = _mm_mul_sd(rinvsix,rinvsq);
+ rinvsix = _mm_mul_ps(rinvsq,rinvsq);
+ rinvsix = _mm_mul_ps(rinvsix,rinvsq);
- vvdw6 = _mm_mul_sd(c6,rinvsix);
- vvdw12 = _mm_mul_sd(c12, _mm_mul_sd(rinvsix,rinvsix));
- vvdwtot = _mm_add_sd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6));
-
- fscal = _mm_sub_sd(_mm_mul_sd(rinvsq,
- _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
- _mm_mul_sd(six,vvdw6))),
- _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
+ vvdw6 = _mm_mul_ps(c6,rinvsix);
+ vvdw12 = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
+ vvdwtot = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
+
+ fscal = _mm_sub_ps(_mm_mul_ps(rinvsq,
+ _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
+ _mm_mul_ps(six,vvdw6))),
+ _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
/***********************************/
/* INTERACTION SECTION ENDS HERE */
/***********************************/
/* Calculate temporary vectorial force */
- tx = _mm_mul_sd(fscal,dx);
- ty = _mm_mul_sd(fscal,dy);
- tz = _mm_mul_sd(fscal,dz);
+ tx = _mm_mul_ps(fscal,dx);
+ ty = _mm_mul_ps(fscal,dy);
+ tz = _mm_mul_ps(fscal,dz);
/* Increment i atom force */
- fix = _mm_add_sd(fix,tx);
- fiy = _mm_add_sd(fiy,ty);
- fiz = _mm_add_sd(fiz,tz);
+ fix = _mm_add_ps(fix,tx);
+ fiy = _mm_add_ps(fiy,ty);
+ fiz = _mm_add_ps(fiz,tz);
/* Store j forces back */
- GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
- }
+ GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
+ }
+
+ offset = (nj1-nj0)%4;
+
+ if(offset!=0)
+ {
+ /* Epilogue loop to take care of non-4-multiple j particles */
+ if(offset==1)
+ {
+ jnrA = jjnr[k];
+ j3A = 3*jnrA;
+ tjA = nti+2*type[jnrA];
+ GMX_MM_LOAD_1RVEC_1POINTER_PS(pos+j3A,jx,jy,jz);
+ GMX_MM_LOAD_1VALUE_PS(charge+jnrA,jq);
+ GMX_MM_LOAD_1VALUE_PS(invsqrta+jnrA,isaj);
+ GMX_MM_LOAD_1PAIR_PS(vdwparam+tjA,c6,c12);
+ mask = mask1;
+ }
+ else if(offset==2)
+ {
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
+ j3A = 3*jnrA;
+ j3B = 3*jnrB;
+ tjA = nti+2*type[jnrA];
+ tjB = nti+2*type[jnrB];
+ GMX_MM_LOAD_1RVEC_2POINTERS_PS(pos+j3A,pos+j3B,jx,jy,jz);
+ GMX_MM_LOAD_2VALUES_PS(charge+jnrA,charge+jnrB,jq);
+ GMX_MM_LOAD_2VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,isaj);
+ GMX_MM_LOAD_2PAIRS_PS(vdwparam+tjA,vdwparam+tjB,c6,c12);
+ mask = mask2;
+ }
+ else
+ {
+ /* offset must be 3 */
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
+ jnrC = jjnr[k+2];
+ j3A = 3*jnrA;
+ j3B = 3*jnrB;
+ j3C = 3*jnrC;
+ tjA = nti+2*type[jnrA];
+ tjB = nti+2*type[jnrB];
+ tjC = nti+2*type[jnrC];
+ GMX_MM_LOAD_1RVEC_3POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,jx,jy,jz);
+ GMX_MM_LOAD_3VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,jq);
+ GMX_MM_LOAD_3VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,isaj);
+ GMX_MM_LOAD_3PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,c6,c12);
+ mask = mask3;
+ }
+
+ dx = _mm_sub_ps(ix,jx);
+ dy = _mm_sub_ps(iy,jy);
+ dz = _mm_sub_ps(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
+
+ /* invsqrt */
+ rinv = gmx_mm_invsqrt_ps(rsq);
+ rinv = _mm_and_ps(rinv,mask);
+
+ rinvsq = _mm_mul_ps(rinv,rinv);
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ isaprod = _mm_mul_ps(isai,isaj);
+ qq = _mm_and_ps(_mm_mul_ps(iq,jq),mask);
+ vcoul = _mm_mul_ps(qq,rinv);
+ fscal = _mm_mul_ps(vcoul,rinv);
+ vctot = _mm_add_ps(vctot,vcoul);
+
+ /* Polarization interaction */
+ qq = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
+ gbscale = _mm_mul_ps(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_ps(rsq,rinv);
+ rtab = _mm_mul_ps(r,gbscale);
+
+ n0 = _mm_cvttps_epi32(rtab);
+ eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_ps */
+ Y = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
+ F = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
+ G = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
+ H = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
+ _MM_TRANSPOSE4_PS(Y,F,G,H);
+
+ G = _mm_mul_ps(G,eps);
+ H = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
+ F = _mm_add_ps(F, _mm_add_ps( G , H ) );
+ Y = _mm_add_ps(Y, _mm_mul_ps(F, eps));
+ F = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
+ vgb = _mm_mul_ps(Y, qq);
+ fijGB = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
+
+ dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
+ vgbtot = _mm_add_ps(vgbtot, vgb);
+
+ dvdasum = _mm_add_ps(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
+
+ /* Lennard-Jones */
+
+ rinvsix = _mm_mul_ps(rinvsq,rinvsq);
+ rinvsix = _mm_mul_ps(rinvsix,rinvsq);
+
+ vvdw6 = _mm_mul_ps(c6,rinvsix);
+ vvdw12 = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
+ vvdwtot = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
+
+ fscal = _mm_sub_ps(_mm_mul_ps(rinvsq,
+ _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
+ _mm_mul_ps(six,vvdw6))),
+ _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_ps(fscal,dx);
+ ty = _mm_mul_ps(fscal,dy);
+ tz = _mm_mul_ps(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_ps(fix,tx);
+ fiy = _mm_add_ps(fiy,ty);
+ fiz = _mm_add_ps(fiz,tz);
+
+ if(offset==1)
+ {
+ GMX_MM_INCREMENT_1VALUE_PS(dvda+jnrA,dvdatmp);
+ GMX_MM_DECREMENT_1RVEC_1POINTER_PS(faction+j3A,tx,ty,tz);
+ }
+ else if(offset==2)
+ {
+ GMX_MM_INCREMENT_2VALUES_PS(dvda+jnrA,dvda+jnrB,dvdatmp);
+ GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(faction+j3A,faction+j3B,tx,ty,tz);
+ }
+ else
+ {
+ /* offset must be 3 */
+ GMX_MM_INCREMENT_3VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvdatmp);
+ GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,tx,ty,tz);
+ }
+ }
+
+ dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai,isai));
+ gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
- dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
- gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+ ggid = gid[n];
+ GMX_MM_UPDATE_4POT_PS(vctot,vc+ggid,vvdwtot,vvdw+ggid,vgbtot,gpol+ggid,dvdasum,dvda+ii);
+ }
+
+ *outeriter = nri;
+ *inneriter = nj1;
+}
+
- ggid = gid[n];
-
- gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
- gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
- }
- *outeriter = nri;
- *inneriter = nj1;
+/*
+ * Gromacs nonbonded kernel nb_kernel410nf
+ * Coulomb interaction: Generalized-Born
+ * VdW interaction: Lennard-Jones
+ * water optimization: No
+ * Calculate forces: no
+ */
+void nb_kernel410nf_x86_64_sse(
+ int * p_nri,
+ int * iinr,
+ int * jindex,
+ int * jjnr,
+ int * shift,
+ float * shiftvec,
+ float * fshift,
+ int * gid,
+ float * pos,
+ float * faction,
+ float * charge,
+ float * p_facel,
+ float * p_krf,
+ float * p_crf,
+ float * Vc,
+ int * type,
+ int * p_ntype,
+ float * vdwparam,
+ float * vvdw,
+ float * p_tabscale,
+ float * VFtab,
+ float * invsqrta,
+ float * dvda,
+ float * p_gbtabscale,
+ float * GBtab,
+ int * p_nthreads,
+ int * count,
+ void * mtx,
+ int * outeriter,
+ int * inneriter,
+ float * work)
+{
+ /* Scalar (non-vectorized) energy-only kernel: accumulates Coulomb and
+ Lennard-Jones energies per energy group; faction/fshift are never
+ written ("nf" = no forces). */
+ int nri,ntype,nthreads;
+ /* NOTE(review): fgb is declared but never used, and krf/crf/tabscale/
+ nthreads are set but unused below -- generator artifacts. */
+ float facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
+ int n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
+ float shX,shY,shZ;
+ float rinvsq;
+ float iq;
+ float qq,vcoul,vctot;
+ int nti;
+ int tj;
+ float rinvsix;
+ float vvdw6,vvdwtot;
+ float vvdw12;
+ float r,rt,eps,eps2;
+ int n0,nnn;
+ float Y,F,Geps,Heps2,Fp,VV;
+ float isai,isaj,isaprod,gbscale;
+ float ix1,iy1,iz1;
+ float jx1,jy1,jz1;
+ float dx11,dy11,dz11,rsq11,rinv11;
+ float c6,c12;
+ /* Bit-field constants for the software invsqrt table lookup below
+ (IEEE-754 single: 23-bit fraction, 8-bit exponent). */
+ const int fractshift = 12;
+ const int fractmask = 8388607;
+ const int expshift = 23;
+ const int expmask = 2139095040;
+ const int explsb = 8388608;
+ float lu;
+ int iexp,addr;
+ /* Union used to access the raw bit pattern of a float. */
+ union { unsigned int bval; float fval; } bitpattern,result;
+
+ nri = *p_nri;
+ ntype = *p_ntype;
+ nthreads = *p_nthreads;
+ facel = *p_facel;
+ krf = *p_krf;
+ crf = *p_crf;
+ tabscale = *p_tabscale;
+ gbtabscale = *p_gbtabscale;
+ nj1 = 0;
+
+ /* Outer loop over i particles */
+ for(n=0; (n<nri); n++)
+ {
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
+ ix1 = shX + pos[ii3+0];
+ iy1 = shY + pos[ii3+1];
+ iz1 = shZ + pos[ii3+2];
+ iq = facel*charge[ii];
+ isai = invsqrta[ii];
+ nti = 2*ntype*type[ii];
+ vctot = 0;
+ vvdwtot = 0;
+
+ /* Inner loop over j particles in this i particle's neighborlist */
+ for(k=nj0; (k<nj1); k++)
+ {
+ jnr = jjnr[k];
+ j3 = 3*jnr;
+ jx1 = pos[j3+0];
+ jy1 = pos[j3+1];
+ jz1 = pos[j3+2];
+ dx11 = ix1 - jx1;
+ dy11 = iy1 - jy1;
+ dz11 = iz1 - jz1;
+ rsq11 = dx11*dx11+dy11*dy11+dz11*dz11;
+ /* Software 1/sqrt(rsq): table-lookup seed from the exponent and
+ fraction bits, refined by one Newton-Raphson iteration. */
+ bitpattern.fval = rsq11;
+ iexp = (((bitpattern.bval)&expmask)>>expshift);
+ addr = (((bitpattern.bval)&(fractmask|explsb))>>fractshift);
+ result.bval = gmx_invsqrt_exptab[iexp] | gmx_invsqrt_fracttab[addr];
+ lu = result.fval;
+ rinv11 = (0.5*lu*(3.0-((rsq11*lu)*lu)));
+ isaj = invsqrta[jnr];
+ isaprod = isai*isaj;
+ qq = iq*charge[jnr];
+ vcoul = qq*rinv11;
+ qq = isaprod*(-qq);
+ gbscale = isaprod*gbtabscale;
+ tj = nti+2*type[jnr];
+ c6 = vdwparam[tj];
+ c12 = vdwparam[tj+1];
+ rinvsq = rinv11*rinv11;
+ /* GB table lookup: cubic polynomial in eps with coefficients
+ Y,F,G,H stored as 4 consecutive floats per table point. */
+ r = rsq11*rinv11;
+ rt = r*gbscale;
+ n0 = rt;
+ eps = rt-n0;
+ eps2 = eps*eps;
+ nnn = 4*n0;
+ Y = GBtab[nnn];
+ F = GBtab[nnn+1];
+ Geps = eps*GBtab[nnn+2];
+ Heps2 = eps2*GBtab[nnn+3];
+ Fp = F+Geps+Heps2;
+ VV = Y+eps*Fp;
+ vgb = qq*VV;
+ /* NOTE(review): vgb is computed but never accumulated or written
+ out, so the GB polarization energy appears to be discarded here
+ -- confirm against the kernel generator template. */
+ vctot = vctot + vcoul;
+ rinvsix = rinvsq*rinvsq*rinvsq;
+ vvdw6 = c6*rinvsix;
+ vvdw12 = c12*rinvsix*rinvsix;
+ vvdwtot = vvdwtot+vvdw12-vvdw6;
+ }
+
+ /* Add the accumulated energies to this i particle's energy group */
+ ggid = gid[n];
+ Vc[ggid] = Vc[ggid] + vctot;
+ vvdw[ggid] = vvdw[ggid] + vvdwtot;
+ }
+
+ *outeriter = nri;
+ *inneriter = nj1;
}
+
+
*
* Options used when generation this file:
* Language: c
- * Precision: double
+ * Precision: single
* Threads: no
* Software invsqrt: yes
* Prefetch forces: no
#include<math.h>
#include<vec.h>
+
#include <xmmintrin.h>
#include <emmintrin.h>
-#include <gmx_sse2_double.h>
+#include <gmx_sse2_single.h>
/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
-#include "nb_kernel430_ia32_sse2.h"
+#include "nb_kernel430_ia32_sse.h"
+
+
-void nb_kernel430_ia32_sse2(int * p_nri,
- int * iinr,
- int * jindex,
- int * jjnr,
- int * shift,
- double * shiftvec,
- double * fshift,
- int * gid,
- double * pos,
- double * faction,
- double * charge,
- double * p_facel,
- double * p_krf,
- double * p_crf,
- double * vc,
- int * type,
- int * p_ntype,
- double * vdwparam,
- double * vvdw,
- double * p_tabscale,
- double * VFtab,
- double * invsqrta,
- double * dvda,
- double * p_gbtabscale,
- double * GBtab,
- int * p_nthreads,
- int * count,
- void * mtx,
- int * outeriter,
- int * inneriter,
- double * work)
+void nb_kernel430_ia32_sse(int * p_nri,
+ int * iinr,
+ int * jindex,
+ int * jjnr,
+ int * shift,
+ float * shiftvec,
+ float * fshift,
+ int * gid,
+ float * pos,
+ float * faction,
+ float * charge,
+ float * p_facel,
+ float * p_krf,
+ float * p_crf,
+ float * Vc,
+ int * type,
+ int * p_ntype,
+ float * vdwparam,
+ float * Vvdw,
+ float * p_tabscale,
+ float * VFtab,
+ float * invsqrta,
+ float * dvda,
+ float * p_gbtabscale,
+ float * GBtab,
+ int * p_nthreads,
+ int * count,
+ void * mtx,
+ int * outeriter,
+ int * inneriter,
+ float * work)
{
int nri,ntype,nthreads;
+ float facel,krf,crf,tabscale,gbtabscale;
int n,ii,is3,ii3,k,nj0,nj1,ggid;
- double shX,shY,shZ;
+ float shX,shY,shZ;
int offset,nti;
- int jnrA,jnrB;
- int j3A,j3B;
- int tjA,tjB;
+ int jnr,jnr2,jnr3,jnr4,j3,j23,j33,j43;
+ int tj,tj2,tj3,tj4;
gmx_gbdata_t *gbdata;
- double * gpol;
-
- __m128d iq,qq,jq,isai;
- __m128d ix,iy,iz;
- __m128d jx,jy,jz;
- __m128d dx,dy,dz;
- __m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
- __m128d fix,fiy,fiz,tx,ty,tz,rsq;
- __m128d rinv,isaj,isaprod;
- __m128d vcoul,fscal,gbscale,c6,c12;
- __m128d rinvsq,r,rtab;
- __m128d eps,Y,F,G,H;
- __m128d VV,FF,Fp;
- __m128d vgb,fijGB,dvdatmp;
- __m128d rinvsix,vvdw6,vvdw12,vvdwtmp;
- __m128d facel,gbtabscale,dvdaj;
- __m128d fijD,fijR,fijC;
- __m128d xmm1,tabscale,eps2;
+ float * gpol;
+
+ __m128 iq,qq,q,isai;
+ __m128 ix,iy,iz;
+ __m128 jx,jy,jz;
+ __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
+ __m128 dx1,dy1,dz1;
+ __m128 vctot,Vvdwtot,dvdasum;
+ __m128 fix,fiy,fiz,rsq;
+ __m128 t1,t2,t3;
+ __m128 rinv,isaj,isaprod;
+ __m128 vcoul,fscal,gbscale,c6,c12;
+ __m128 rinvsq,dvdaj,r,rt;
+ __m128 eps,eps2,Y,F,G,H,Geps,Heps2;
+ __m128 Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp;
+ __m128 rinvsix,Vvdw6,Vvdw12,Vvdwtmp,vgbtot,n0f;
+ __m128 fac_sse,tabscale_sse,gbtabscale_sse;
+
__m128i n0, nnn;
-
+ const __m128 neg = _mm_set1_ps(-1.0f);
+ const __m128 zero = _mm_set1_ps(0.0f);
+ const __m128 half = _mm_set1_ps(0.5f);
+ const __m128 two = _mm_set1_ps(2.0f);
+ const __m128 three = _mm_set1_ps(3.0f);
+ const __m128 six = _mm_set1_ps(6.0f);
+ const __m128 twelwe = _mm_set1_ps(12.0f);
- const __m128d neg = _mm_set1_pd(-1.0);
- const __m128d zero = _mm_set1_pd(0.0);
- const __m128d minushalf = _mm_set1_pd(-0.5);
- const __m128d two = _mm_set1_pd(2.0);
+ __m128i four = _mm_set1_epi32(4);
+ __m128i maski = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);
+ __m128i mask = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);
- gbdata = (gmx_gbdata_t *)work;
- gpol = gbdata->gpol;
-
- nri = *p_nri;
- ntype = *p_ntype;
-
- gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
- gbtabscale = _mm_load1_pd(p_gbtabscale);
- facel = _mm_load1_pd(p_facel);
- tabscale = _mm_load1_pd(p_tabscale);
-
- nj1 = 0;
- jnrA = jnrB = 0;
- j3A = j3B = 0;
- jx = _mm_setzero_pd();
- jy = _mm_setzero_pd();
- jz = _mm_setzero_pd();
- c6 = _mm_setzero_pd();
- c12 = _mm_setzero_pd();
+ float vct,vdwt,vgbt,dva,isai_f;
- for(n=0;n<nri;n++)
- {
+ gbdata = (gmx_gbdata_t *)work;
+ gpol = gbdata->gpol;
+
+ nri = *p_nri;
+ ntype = *p_ntype;
+ nthreads = *p_nthreads;
+ facel = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));
+ krf = *p_krf;
+ crf = *p_crf;
+ tabscale = *p_tabscale;
+ gbtabscale = *p_gbtabscale;
+ nj1 = 0;
+
+ /* Splat variables */
+ fac_sse = _mm_load1_ps(&facel);
+ tabscale_sse = _mm_load1_ps(&tabscale);
+ gbtabscale_sse = _mm_load1_ps(&gbtabscale);
+
+
+ /* Keep the compiler happy */
+ Vvdwtmp = _mm_setzero_ps();
+ dvdatmp = _mm_setzero_ps();
+ vcoul = _mm_setzero_ps();
+ vgb = _mm_setzero_ps();
+ t1 = _mm_setzero_ps();
+ t2 = _mm_setzero_ps();
+ t3 = _mm_setzero_ps();
+ xmm1 = _mm_setzero_ps();
+ xmm2 = _mm_setzero_ps();
+ xmm3 = _mm_setzero_ps();
+ xmm4 = _mm_setzero_ps();
+ j23 = j33 = 0;
+ jnr2 = jnr3 = 0;
+
+ for(n=0; (n<nri); n++)
+ {
is3 = 3*shift[n];
shX = shiftvec[is3];
shY = shiftvec[is3+1];
nj1 = jindex[n+1];
ii = iinr[n];
ii3 = 3*ii;
-
- ix = _mm_set1_pd(shX+pos[ii3+0]);
- iy = _mm_set1_pd(shY+pos[ii3+1]);
- iz = _mm_set1_pd(shZ+pos[ii3+2]);
-
- iq = _mm_load1_pd(charge+ii);
- iq = _mm_mul_pd(iq,facel);
-
- isai = _mm_load1_pd(invsqrta+ii);
+ ix = _mm_set1_ps(shX+pos[ii3+0]);
+ iy = _mm_set1_ps(shY+pos[ii3+1]);
+ iz = _mm_set1_ps(shZ+pos[ii3+2]);
+
+ iq = _mm_load1_ps(charge+ii);
+ iq = _mm_mul_ps(iq,fac_sse);
+
+ isai_f = invsqrta[ii];
+ isai = _mm_load1_ps(&isai_f);
+
nti = 2*ntype*type[ii];
- vctot = _mm_setzero_pd();
- vvdwtot = _mm_setzero_pd();
- vgbtot = _mm_setzero_pd();
- dvdasum = _mm_setzero_pd();
- fix = _mm_setzero_pd();
- fiy = _mm_setzero_pd();
- fiz = _mm_setzero_pd();
+ vctot = _mm_setzero_ps();
+ Vvdwtot = _mm_setzero_ps();
+ dvdasum = _mm_setzero_ps();
+ vgbtot = _mm_setzero_ps();
+ fix = _mm_setzero_ps();
+ fiy = _mm_setzero_ps();
+ fiz = _mm_setzero_ps();
+
+ offset = (nj1-nj0)%4;
- for(k=nj0;k<nj1-1; k+=2)
+ for(k=nj0;k<nj1-offset;k+=4)
{
- jnrA = jjnr[k];
- jnrB = jjnr[k+1];
-
- j3A = jnrA * 3;
- j3B = jnrB * 3;
-
- GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
-
- dx = _mm_sub_pd(ix,jx);
- dy = _mm_sub_pd(iy,jy);
- dz = _mm_sub_pd(iz,jz);
-
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
- rinvsq = _mm_mul_pd(rinv,rinv);
-
- /***********************************/
- /* INTERACTION SECTION STARTS HERE */
- /***********************************/
- GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
- GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
-
- /* Lennard-Jones */
- tjA = nti+2*type[jnrA];
- tjB = nti+2*type[jnrB];
-
- GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
-
- isaprod = _mm_mul_pd(isai,isaj);
- qq = _mm_mul_pd(iq,jq);
- vcoul = _mm_mul_pd(qq,rinv);
- fscal = _mm_mul_pd(vcoul,rinv);
- vctot = _mm_add_pd(vctot,vcoul);
-
- /* Polarization interaction */
- qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
- gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
- /* Calculate GB table index */
- r = _mm_mul_pd(rsq,rinv);
- rtab = _mm_mul_pd(r,gbscale);
-
- n0 = _mm_cvttpd_epi32(rtab);
- eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
- nnn = _mm_slli_epi32(n0,2);
-
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
- F = _mm_add_pd(F, _mm_add_pd( G , H ) );
- Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
- F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
- vgb = _mm_mul_pd(Y, qq);
- fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
-
- dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_pd(vgbtot, vgb);
-
- dvdasum = _mm_add_pd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
-
- GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
-
- /* Calculate VDW table index */
- rtab = _mm_mul_pd(r,tabscale);
- n0 = _mm_cvttpd_epi32(rtab);
- eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
- eps2 = _mm_mul_pd(eps,eps);
+ jnr = jjnr[k];
+ jnr2 = jjnr[k+1];
+ jnr3 = jjnr[k+2];
+ jnr4 = jjnr[k+3];
+
+ j3 = 3*jnr;
+ j23 = 3*jnr2;
+ j33 = 3*jnr3;
+ j43 = 3*jnr4;
+
+ xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (pos+j3)); /* x1 y1 - - */
+ xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (pos+j23)); /* x2 y2 - - */
+ xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (pos+j33)); /* x3 y3 - - */
+ xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (pos+j43)); /* x4 y4 - - */
+
+ xmm5 = _mm_load1_ps(pos+j3+2);
+ xmm6 = _mm_load1_ps(pos+j23+2);
+ xmm7 = _mm_load1_ps(pos+j33+2);
+ xmm8 = _mm_load1_ps(pos+j43+2);
+
+ /* transpose */
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+ xmm6 = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
+ jz = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+ xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
+ jx = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
+ jy = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
+
+ dx1 = _mm_sub_ps(ix,jx);
+ dy1 = _mm_sub_ps(iy,jy);
+ dz1 = _mm_sub_ps(iz,jz);
+
+ t1 = _mm_mul_ps(dx1,dx1);
+ t2 = _mm_mul_ps(dy1,dy1);
+ t3 = _mm_mul_ps(dz1,dz1);
+
+ rsq = _mm_add_ps(t1,t2);
+ rsq = _mm_add_ps(rsq,t3);
+ rinv = gmx_mm_invsqrt_ps(rsq);
+
+ xmm1 = _mm_load_ss(invsqrta+jnr);
+ xmm2 = _mm_load_ss(invsqrta+jnr2);
+ xmm3 = _mm_load_ss(invsqrta+jnr3);
+ xmm4 = _mm_load_ss(invsqrta+jnr4);
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
+ xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
+ isaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+
+ isaprod = _mm_mul_ps(isai,isaj);
+
+ xmm1 = _mm_load_ss(charge+jnr);
+ xmm2 = _mm_load_ss(charge+jnr2);
+ xmm3 = _mm_load_ss(charge+jnr3);
+ xmm4 = _mm_load_ss(charge+jnr4);
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
+ xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
+ q = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+ qq = _mm_mul_ps(iq,q);
+
+ vcoul = _mm_mul_ps(qq,rinv);
+ fscal = _mm_mul_ps(vcoul,rinv);
+ qq = _mm_mul_ps(qq,neg);
+ qq = _mm_mul_ps(isaprod,qq);
+ gbscale = _mm_mul_ps(isaprod,gbtabscale_sse);
+
+ tj = nti+2*type[jnr];
+ tj2 = nti+2*type[jnr2];
+ tj3 = nti+2*type[jnr3];
+ tj4 = nti+2*type[jnr4];
+
+ xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (vdwparam+tj));
+ xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (vdwparam+tj2));
+ xmm2 = _mm_loadl_pi(xmm2,(__m64 *) (vdwparam+tj3));
+ xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (vdwparam+tj4));
+
+ xmm3 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,2,0,2)); /* c61 c62 c61 c62 */
+ xmm4 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(0,2,0,2)); /* c63 c64 c63 c64 */
+ c6 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,1,0,1)); /* c61 c62 c63 c64 */
+
+ xmm3 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1)); /* c121 c122 c121 c122 */
+ xmm4 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,3,1)); /* c123 c124 c123 c124 */
+ c12 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(1,0,1,0)); /* c121 c122 c123 c124 */
+
+ xmm1 = _mm_load_ss(dvda+jnr);
+ xmm2 = _mm_load_ss(dvda+jnr2);
+ xmm3 = _mm_load_ss(dvda+jnr3);
+ xmm4 = _mm_load_ss(dvda+jnr4);
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
+ xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
+ dvdaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+
+ /* Calculate GB table index */
+ r = _mm_mul_ps(rsq,rinv);
+ rt = _mm_mul_ps(r,gbscale);
+
+ n0 = _mm_cvttps_epi32(rt);
+ n0f = _mm_cvtepi32_ps(n0);
+ eps = _mm_sub_ps(rt,n0f);
+
+ eps2 = _mm_mul_ps(eps,eps);
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_ps */
+ xmm1 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+
+ /* transpose 4*4 */
+ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+ xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+ xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+ xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+
+ Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
+ F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
+ G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
+ H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */
+
+ Geps = _mm_mul_ps(G,eps);
+ Heps2 = _mm_mul_ps(H,eps2);
+ Fp = _mm_add_ps(F,Geps);
+ Fp = _mm_add_ps(Fp,Heps2);
+
+ VV = _mm_mul_ps(Fp,eps);
+ VV = _mm_add_ps(Y,VV);
+
+ xmm1 = _mm_mul_ps(two,Heps2);
+ FF = _mm_add_ps(Fp,Geps);
+ FF = _mm_add_ps(FF,xmm1);
+
+ vgb = _mm_mul_ps(qq,VV);
+ fijC = _mm_mul_ps(qq,FF);
+ fijC = _mm_mul_ps(fijC,gbscale);
+
+ dvdatmp = _mm_mul_ps(fijC,r);
+ dvdatmp = _mm_add_ps(vgb,dvdatmp);
+ dvdatmp = _mm_mul_ps(half,dvdatmp);
+ dvdatmp = _mm_mul_ps(neg,dvdatmp);
+
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+
+ xmm1 = _mm_mul_ps(dvdatmp,isaj);
+ xmm1 = _mm_mul_ps(xmm1,isaj);
+ dvdaj = _mm_add_ps(dvdaj,xmm1);
+
+ _mm_store_ss(dvda+jnr,dvdaj);
+ xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(dvda+jnr2,xmm1);
+ xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(1,0,3,2));
+ _mm_store_ss(dvda+jnr3,xmm1);
+ xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(2,1,0,3));
+ _mm_store_ss(dvda+jnr4,xmm1);
+
+ vctot = _mm_add_ps(vctot,vcoul);
+ vgbtot = _mm_add_ps(vgbtot,vgb);
+
+ /* Calculate VDW table index */
+ rt = _mm_mul_ps(r,tabscale_sse);
+ n0 = _mm_cvttps_epi32(rt);
+ n0f = _mm_cvtepi32_ps(n0);
+ eps = _mm_sub_ps(rt,n0f);
+ eps2 = _mm_mul_ps(eps,eps);
nnn = _mm_slli_epi32(n0,3);
+
+			/* Tabulated VdW interaction - dispersion */
+ xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+
+ /* transpose 4*4 */
+ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+ xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+ xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+ xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+
+ Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
+ F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
+ G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
+ H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */
+
+ Geps = _mm_mul_ps(G,eps);
+ Heps2 = _mm_mul_ps(H,eps2);
+ Fp = _mm_add_ps(F,Geps);
+ Fp = _mm_add_ps(Fp,Heps2);
+ VV = _mm_mul_ps(Fp,eps);
+ VV = _mm_add_ps(Y,VV);
+ xmm1 = _mm_mul_ps(two,Heps2);
+ FF = _mm_add_ps(Fp,Geps);
+ FF = _mm_add_ps(FF,xmm1);
+
+ Vvdw6 = _mm_mul_ps(c6,VV);
+ fijD = _mm_mul_ps(c6,FF);
+
+ /* Tabulated VdW interaction - repulsion */
+ nnn = _mm_add_epi32(nnn,four);
+
+ xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+
+ /* transpose 4*4 */
+ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+ xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+ xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+ xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+
+ Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
+ F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
+ G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
+ H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */
+
+ Geps = _mm_mul_ps(G,eps);
+ Heps2 = _mm_mul_ps(H,eps2);
+ Fp = _mm_add_ps(F,Geps);
+ Fp = _mm_add_ps(Fp,Heps2);
+ VV = _mm_mul_ps(Fp,eps);
+ VV = _mm_add_ps(Y,VV);
+ xmm1 = _mm_mul_ps(two,Heps2);
+ FF = _mm_add_ps(Fp,Geps);
+ FF = _mm_add_ps(FF,xmm1);
+
+ Vvdw12 = _mm_mul_ps(c12,VV);
+ fijR = _mm_mul_ps(c12,FF);
+
+ Vvdwtmp = _mm_add_ps(Vvdw12,Vvdw6);
+ Vvdwtot = _mm_add_ps(Vvdwtot,Vvdwtmp);
+
+ xmm1 = _mm_add_ps(fijD,fijR);
+ xmm1 = _mm_mul_ps(xmm1,tabscale_sse);
+ xmm1 = _mm_add_ps(xmm1,fijC);
+ xmm1 = _mm_sub_ps(xmm1,fscal);
+ fscal = _mm_mul_ps(xmm1,neg);
+ fscal = _mm_mul_ps(fscal,rinv);
+
+ t1 = _mm_mul_ps(fscal,dx1);
+ t2 = _mm_mul_ps(fscal,dy1);
+ t3 = _mm_mul_ps(fscal,dz1);
+
+ fix = _mm_add_ps(fix,t1);
+ fiy = _mm_add_ps(fiy,t2);
+ fiz = _mm_add_ps(fiz,t3);
+
+ xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3)); /* fx1 fy1 - - */
+			xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23)); /* fx2 fy2 - - */
+ xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (faction+j33)); /* fx3 fy3 - - */
+ xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (faction+j43)); /* fx4 fy4 - - */
+
+ xmm5 = _mm_load1_ps(faction+j3+2);
+ xmm6 = _mm_load1_ps(faction+j23+2);
+ xmm7 = _mm_load1_ps(faction+j33+2);
+ xmm8 = _mm_load1_ps(faction+j43+2);
+
+ /* transpose the forces */
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+ xmm6 = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
+ xmm7 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+ xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
+
+ xmm5 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
+ xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
+
+ /* subtract partial terms */
+ xmm5 = _mm_sub_ps(xmm5,t1);
+ xmm6 = _mm_sub_ps(xmm6,t2);
+ xmm7 = _mm_sub_ps(xmm7,t3);
+
+ xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
+ xmm2 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2));
+ xmm1 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0));
+ xmm2 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0));
+
+ /* store fx's and fy's */
+ _mm_storel_pi( (__m64 *) (faction+j3),xmm1);
+ _mm_storeh_pi( (__m64 *) (faction+j23),xmm1);
+ _mm_storel_pi( (__m64 *) (faction+j33),xmm2);
+ _mm_storeh_pi( (__m64 *) (faction+j43),xmm2);
+
+ /* ..then z */
+ _mm_store_ss(faction+j3+2,xmm7);
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(faction+j23+2,xmm7);
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(faction+j33+2,xmm7);
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(faction+j43+2,xmm7);
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H,eps2);
- Fp = _mm_add_pd(F,G);
- Fp = _mm_add_pd(Fp,H);
- VV = _mm_mul_pd(Fp,eps);
- VV = _mm_add_pd(Y,VV);
- xmm1 = _mm_mul_pd(two,H);
- FF = _mm_add_pd(Fp,G);
- FF = _mm_add_pd(FF,xmm1);
-
- vvdw6 = _mm_mul_pd(c6,VV);
- fijD = _mm_mul_pd(c6,FF);
-
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
- F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
- H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H,eps2);
- Fp = _mm_add_pd(F,G);
- Fp = _mm_add_pd(Fp,H);
- VV = _mm_mul_pd(Fp,eps);
- VV = _mm_add_pd(Y,VV);
- xmm1 = _mm_mul_pd(two,H);
- FF = _mm_add_pd(Fp,G);
- FF = _mm_add_pd(FF,xmm1);
-
- vvdw12 = _mm_mul_pd(c12,VV);
- fijR = _mm_mul_pd(c12,FF);
-
- vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
- vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
-
- xmm1 = _mm_add_pd(fijD,fijR);
- xmm1 = _mm_mul_pd(xmm1,tabscale);
- xmm1 = _mm_add_pd(xmm1,fijC);
- xmm1 = _mm_sub_pd(xmm1,fscal);
- fscal = _mm_mul_pd(xmm1,neg);
- fscal = _mm_mul_pd(fscal,rinv);
-
- /***********************************/
- /* INTERACTION SECTION ENDS HERE */
- /***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_pd(fscal,dx);
- ty = _mm_mul_pd(fscal,dy);
- tz = _mm_mul_pd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_pd(fix,tx);
- fiy = _mm_add_pd(fiy,ty);
- fiz = _mm_add_pd(fiz,tz);
-
- /* Store j forces back */
- GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
-
- /* In double precision, offset can only be either 0 or 1 */
- if(k<nj1)
+
+ if(offset!=0)
{
- jnrA = jjnr[k];
- j3A = jnrA * 3;
-
- GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
-
- dx = _mm_sub_sd(ix,jx);
- dy = _mm_sub_sd(iy,jy);
- dz = _mm_sub_sd(iz,jz);
-
- rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
-
- rinv = gmx_mm_invsqrt_pd(rsq);
- rinvsq = _mm_mul_sd(rinv,rinv);
-
- /***********************************/
- /* INTERACTION SECTION STARTS HERE */
- /***********************************/
- GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
- GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
-
- /* Lennard-Jones */
- tjA = nti+2*type[jnrA];
-
- GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
-
- isaprod = _mm_mul_sd(isai,isaj);
- qq = _mm_mul_sd(iq,jq);
- vcoul = _mm_mul_sd(qq,rinv);
- fscal = _mm_mul_sd(vcoul,rinv);
- vctot = _mm_add_sd(vctot,vcoul);
-
- /* Polarization interaction */
- qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
- gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
- /* Calculate GB table index */
- r = _mm_mul_sd(rsq,rinv);
- rtab = _mm_mul_sd(r,gbscale);
-
- n0 = _mm_cvttpd_epi32(rtab);
- eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
- nnn = _mm_slli_epi32(n0,2);
-
- /* the tables are 16-byte aligned, so we can use _mm_load_pd */
- Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
- F = _mm_add_sd(F, _mm_add_sd( G , H ) );
- Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
- F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
- vgb = _mm_mul_sd(Y, qq);
- fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
-
- dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
-
- vgbtot = _mm_add_sd(vgbtot, vgb);
-
- dvdasum = _mm_add_sd(dvdasum, dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
-
- GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
-
- /* Calculate VDW table index */
- rtab = _mm_mul_sd(r,tabscale);
- n0 = _mm_cvttpd_epi32(rtab);
- eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
- eps2 = _mm_mul_sd(eps,eps);
+ if(offset==1)
+ {
+ jnr = jjnr[k];
+ j3 = jnr*3;
+ tj = nti+2*type[jnr];
+
+ xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (pos+j3));
+ xmm5 = _mm_load1_ps(pos+j3+2);
+
+ xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
+ xmm4 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+
+ isaj = _mm_load_ss(invsqrta+jnr);
+ dvdaj = _mm_load_ss(dvda+jnr);
+ q = _mm_load_ss(charge+jnr);
+ c6 = _mm_load_ss(vdwparam+tj);
+ c12 = _mm_load_ss(vdwparam+tj+1);
+
+ mask = _mm_set_epi32(0,0,0,0xffffffff);
+
+ }
+ else if(offset==2)
+ {
+ jnr = jjnr[k];
+ jnr2 = jjnr[k+1];
+
+ j3 = jnr*3;
+ j23 = jnr2*3;
+
+ tj = nti+2*type[jnr];
+ tj2 = nti+2*type[jnr2];
+
+ xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (pos+j3));
+ xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (pos+j23));
+
+ xmm5 = _mm_load1_ps(pos+j3+2);
+ xmm6 = _mm_load1_ps(pos+j23+2);
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+ xmm5 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+ xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+ xmm4 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
+
+ xmm1 = _mm_load_ss(invsqrta+jnr);
+ xmm2 = _mm_load_ss(invsqrta+jnr2);
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+ isaj = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_load_ss(dvda+jnr);
+ xmm2 = _mm_load_ss(dvda+jnr2);
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+ dvdaj = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_load_ss(charge+jnr);
+ xmm2 = _mm_load_ss(charge+jnr2);
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+ q = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (vdwparam+tj));
+ xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (vdwparam+tj2));
+
+ c6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+ c12 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
+
+ mask = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+ }
+ else
+ {
+ jnr = jjnr[k];
+ jnr2 = jjnr[k+1];
+ jnr3 = jjnr[k+2];
+
+ j3 = jnr*3;
+ j23 = jnr2*3;
+ j33 = jnr3*3;
+
+ tj = nti+2*type[jnr];
+ tj2 = nti+2*type[jnr2];
+ tj3 = nti+2*type[jnr3];
+
+ xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (pos+j3));
+ xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (pos+j23));
+ xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (pos+j33));
+
+ xmm5 = _mm_load1_ps(pos+j3+2);
+ xmm6 = _mm_load1_ps(pos+j23+2);
+ xmm7 = _mm_load1_ps(pos+j33+2);
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
+ xmm5 = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2));
+ xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2));
+
+ xmm6 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0));
+ xmm4 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1));
+
+ xmm1 = _mm_load_ss(invsqrta+jnr);
+ xmm2 = _mm_load_ss(invsqrta+jnr2);
+ xmm3 = _mm_load_ss(invsqrta+jnr3);
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+ xmm3 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0));
+ isaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_load_ss(dvda+jnr);
+ xmm2 = _mm_load_ss(dvda+jnr2);
+ xmm3 = _mm_load_ss(dvda+jnr3);
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+ xmm3 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0));
+ dvdaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_load_ss(charge+jnr);
+ xmm2 = _mm_load_ss(charge+jnr2);
+ xmm3 = _mm_load_ss(charge+jnr3);
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+ xmm3 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0));
+ q = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (vdwparam+tj));
+ xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (vdwparam+tj2));
+ xmm2 = _mm_loadl_pi(xmm2, (__m64 *) (vdwparam+tj3));
+
+ xmm3 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,2,0,2));
+ c6 = _mm_shuffle_ps(xmm3,xmm2,_MM_SHUFFLE(1,0,0,1));
+
+ xmm3 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
+ c12 = _mm_shuffle_ps(xmm3,xmm2,_MM_SHUFFLE(1,1,1,0));
+
+ mask = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+ }
+
+ jx = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm6);
+ jy = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm4);
+ jz = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm5);
+
+ c6 = _mm_and_ps( gmx_mm_castsi128_ps(mask), c6);
+ c12 = _mm_and_ps( gmx_mm_castsi128_ps(mask), c12);
+ dvdaj = _mm_and_ps( gmx_mm_castsi128_ps(mask), dvdaj);
+ isaj = _mm_and_ps( gmx_mm_castsi128_ps(mask), isaj);
+ q = _mm_and_ps( gmx_mm_castsi128_ps(mask), q);
+
+ dx1 = _mm_sub_ps(ix,jx);
+ dy1 = _mm_sub_ps(iy,jy);
+ dz1 = _mm_sub_ps(iz,jz);
+
+ t1 = _mm_mul_ps(dx1,dx1);
+ t2 = _mm_mul_ps(dy1,dy1);
+ t3 = _mm_mul_ps(dz1,dz1);
+
+ rsq = _mm_add_ps(t1,t2);
+ rsq = _mm_add_ps(rsq,t3);
+
+ rinv = gmx_mm_invsqrt_ps(rsq);
+
+ isaprod = _mm_mul_ps(isai,isaj);
+ qq = _mm_mul_ps(iq,q);
+ vcoul = _mm_mul_ps(qq,rinv);
+ fscal = _mm_mul_ps(vcoul,rinv);
+
+ qq = _mm_mul_ps(qq,neg);
+ qq = _mm_mul_ps(isaprod,qq);
+
+ gbscale = _mm_mul_ps(isaprod,gbtabscale_sse);
+ rinvsq = _mm_mul_ps(rinv,rinv);
+ r = _mm_mul_ps(rsq,rinv);
+ rt = _mm_mul_ps(r,gbscale);
+
+ n0 = _mm_cvttps_epi32(rt);
+ n0f = _mm_cvtepi32_ps(n0);
+ eps = _mm_sub_ps(rt,n0f);
+
+ eps2 = _mm_mul_ps(eps,eps);
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_ps */
+ xmm1 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+
+ /* transpose 4*4 */
+ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+ xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+ xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+ xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+
+ Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
+ F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2));
+ G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0));
+ H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2));
+
+ Geps = _mm_mul_ps(G,eps);
+ Heps2 = _mm_mul_ps(H,eps2);
+ Fp = _mm_add_ps(F,Geps);
+ Fp = _mm_add_ps(Fp,Heps2);
+ VV = _mm_mul_ps(Fp,eps);
+ VV = _mm_add_ps(Y,VV);
+
+ xmm1 = _mm_mul_ps(two,Heps2);
+ FF = _mm_add_ps(Fp,Geps);
+ FF = _mm_add_ps(FF,xmm1);
+ vgb = _mm_mul_ps(qq,VV);
+ fijC = _mm_mul_ps(qq,FF);
+ fijC = _mm_mul_ps(fijC,gbscale);
+
+ dvdatmp = _mm_mul_ps(fijC,r);
+ dvdatmp = _mm_add_ps(vgb,dvdatmp);
+ dvdatmp = _mm_mul_ps(half,dvdatmp);
+ dvdatmp = _mm_mul_ps(neg,dvdatmp);
+
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+
+ xmm1 = _mm_mul_ps(dvdatmp,isaj);
+ xmm1 = _mm_mul_ps(xmm1,isaj);
+ dvdaj = _mm_add_ps(dvdaj,xmm1);
+
+ vcoul = _mm_and_ps( gmx_mm_castsi128_ps(mask), vcoul);
+ vgb = _mm_and_ps( gmx_mm_castsi128_ps(mask), vgb);
+
+ vctot = _mm_add_ps(vctot,vcoul);
+ vgbtot = _mm_add_ps(vgbtot,vgb);
+
+ /* Calculate VDW table index */
+ rt = _mm_mul_ps(r,tabscale_sse);
+ n0 = _mm_cvttps_epi32(rt);
+ n0f = _mm_cvtepi32_ps(n0);
+ eps = _mm_sub_ps(rt,n0f);
+ eps2 = _mm_mul_ps(eps,eps);
nnn = _mm_slli_epi32(n0,3);
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H,eps2);
- Fp = _mm_add_sd(F,G);
- Fp = _mm_add_sd(Fp,H);
- VV = _mm_mul_sd(Fp,eps);
- VV = _mm_add_sd(Y,VV);
- xmm1 = _mm_mul_sd(two,H);
- FF = _mm_add_sd(Fp,G);
- FF = _mm_add_sd(FF,xmm1);
-
- vvdw6 = _mm_mul_sd(c6,VV);
- fijD = _mm_mul_sd(c6,FF);
-
- /* Dispersion */
- Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
- F = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(Y,F);
- G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
- H = _mm_setzero_pd();
- GMX_MM_TRANSPOSE2_PD(G,H);
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H,eps2);
- Fp = _mm_add_sd(F,G);
- Fp = _mm_add_sd(Fp,H);
- VV = _mm_mul_sd(Fp,eps);
- VV = _mm_add_sd(Y,VV);
- xmm1 = _mm_mul_sd(two,H);
- FF = _mm_add_sd(Fp,G);
- FF = _mm_add_sd(FF,xmm1);
-
- vvdw12 = _mm_mul_sd(c12,VV);
- fijR = _mm_mul_sd(c12,FF);
-
- vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
- vvdwtot = _mm_add_sd(vvdwtot,vvdwtmp);
-
- xmm1 = _mm_add_sd(fijD,fijR);
- xmm1 = _mm_mul_sd(xmm1,tabscale);
- xmm1 = _mm_add_sd(xmm1,fijC);
- xmm1 = _mm_sub_sd(xmm1,fscal);
- fscal = _mm_mul_sd(xmm1,neg);
- fscal = _mm_mul_sd(fscal,rinv);
-
- /***********************************/
- /* INTERACTION SECTION ENDS HERE */
- /***********************************/
-
- /* Calculate temporary vectorial force */
- tx = _mm_mul_sd(fscal,dx);
- ty = _mm_mul_sd(fscal,dy);
- tz = _mm_mul_sd(fscal,dz);
-
- /* Increment i atom force */
- fix = _mm_add_sd(fix,tx);
- fiy = _mm_add_sd(fiy,ty);
- fiz = _mm_add_sd(fiz,tz);
-
- /* Store j forces back */
- GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
+			/* Tabulated VdW interaction - dispersion */
+ xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+
+ /* transpose 4*4 */
+ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+ xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+ xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+ xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+
+ Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
+ F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
+ G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
+ H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */
+
+ Geps = _mm_mul_ps(G,eps);
+ Heps2 = _mm_mul_ps(H,eps2);
+ Fp = _mm_add_ps(F,Geps);
+ Fp = _mm_add_ps(Fp,Heps2);
+ VV = _mm_mul_ps(Fp,eps);
+ VV = _mm_add_ps(Y,VV);
+ xmm1 = _mm_mul_ps(two,Heps2);
+ FF = _mm_add_ps(Fp,Geps);
+ FF = _mm_add_ps(FF,xmm1);
+
+ Vvdw6 = _mm_mul_ps(c6,VV);
+ fijD = _mm_mul_ps(c6,FF);
+
+ /* Tabulated VdW interaction - repulsion */
+ nnn = _mm_add_epi32(nnn,four);
+
+ xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
+ xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
+ xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
+ xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */
+
+ /* transpose 4*4 */
+ xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
+ xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
+ xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
+ xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */
+
+ Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
+ F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
+ G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
+ H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */
+
+ Geps = _mm_mul_ps(G,eps);
+ Heps2 = _mm_mul_ps(H,eps2);
+ Fp = _mm_add_ps(F,Geps);
+ Fp = _mm_add_ps(Fp,Heps2);
+ VV = _mm_mul_ps(Fp,eps);
+ VV = _mm_add_ps(Y,VV);
+ xmm1 = _mm_mul_ps(two,Heps2);
+ FF = _mm_add_ps(Fp,Geps);
+ FF = _mm_add_ps(FF,xmm1);
+
+ Vvdw12 = _mm_mul_ps(c12,VV);
+ fijR = _mm_mul_ps(c12,FF);
+
+ Vvdwtmp = _mm_add_ps(Vvdw12,Vvdw6);
+ Vvdwtot = _mm_add_ps(Vvdwtot,Vvdwtmp);
+
+ xmm1 = _mm_add_ps(fijD,fijR);
+ xmm1 = _mm_mul_ps(xmm1,tabscale_sse);
+ xmm1 = _mm_add_ps(xmm1,fijC);
+ xmm1 = _mm_sub_ps(xmm1,fscal);
+ fscal = _mm_mul_ps(xmm1,neg);
+ fscal = _mm_mul_ps(fscal,rinv);
+
+ t1 = _mm_mul_ps(fscal,dx1);
+ t2 = _mm_mul_ps(fscal,dy1);
+ t3 = _mm_mul_ps(fscal,dz1);
+
+ if(offset==1)
+ {
+ _mm_store_ss(dvda+jnr,dvdaj);
+
+ xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (faction+j3));
+ xmm7 = _mm_load1_ps(faction+j3+2);
+
+ xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
+ xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(1,1,1,1));
+
+ xmm5 = _mm_sub_ps(xmm5,t1);
+ xmm6 = _mm_sub_ps(xmm6,t2);
+ xmm7 = _mm_sub_ps(xmm7,t3);
+
+ _mm_store_ss(faction+j3 , xmm5);
+ _mm_store_ss(faction+j3+1,xmm6);
+ _mm_store_ss(faction+j3+2,xmm7);
+ }
+ else if(offset==2)
+ {
+ _mm_store_ss(dvda+jnr,dvdaj);
+ xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(dvda+jnr2,xmm1);
+
+ xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3));
+ xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23));
+
+ xmm5 = _mm_load1_ps(faction+j3+2);
+ xmm6 = _mm_load1_ps(faction+j23+2);
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+ xmm7 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+ xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+ xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
+
+ xmm5 = _mm_sub_ps(xmm5, t1);
+ xmm6 = _mm_sub_ps(xmm6, t2);
+ xmm7 = _mm_sub_ps(xmm7, t3);
+
+ xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
+ xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0));
+
+ _mm_storel_pi( (__m64 *) (faction+j3), xmm5);
+ _mm_storeh_pi( (__m64 *) (faction+j23), xmm5);
+
+ _mm_store_ss(faction+j3+2,xmm7);
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(faction+j23+2,xmm7);
+ }
+ else
+ {
+ _mm_store_ss(dvda+jnr,dvdaj);
+ xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(dvda+jnr2,xmm1);
+ xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(1,0,3,2));
+ _mm_store_ss(dvda+jnr3,xmm1);
+
+ xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3));
+ xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23));
+ xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (faction+j33));
+
+ xmm5 = _mm_load1_ps(faction+j3+2);
+ xmm6 = _mm_load1_ps(faction+j23+2);
+ xmm7 = _mm_load1_ps(faction+j33+2);
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
+ xmm6 = _mm_shuffle_ps(xmm7,xmm7, _MM_SHUFFLE(0,0,0,0));
+ xmm7 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+ xmm2 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(3,2,3,2));
+
+ xmm5 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
+ xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
+
+ xmm5 = _mm_sub_ps(xmm5, t1);
+ xmm6 = _mm_sub_ps(xmm6, t2);
+ xmm7 = _mm_sub_ps(xmm7, t3);
+
+ xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
+ xmm2 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2));
+ xmm1 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0));
+ xmm2 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0));
+
+ _mm_storel_pi( (__m64 *) (faction+j3), xmm1);
+ _mm_storeh_pi( (__m64 *) (faction+j23), xmm1);
+ _mm_storel_pi( (__m64 *) (faction+j33), xmm2);
+
+ _mm_store_ss(faction+j3+2,xmm7);
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(faction+j23+2,xmm7);
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(faction+j33+2,xmm7);
+ }
+
+ t1 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t1);
+ t2 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t2);
+ t3 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t3);
+
+ fix = _mm_add_ps(fix,t1);
+ fiy = _mm_add_ps(fiy,t2);
+ fiz = _mm_add_ps(fiz,t3);
}
- dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
- gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+ t1 = _mm_movehl_ps(t1,fix);
+ t2 = _mm_movehl_ps(t2,fiy);
+ t3 = _mm_movehl_ps(t3,fiz);
+
+ fix = _mm_add_ps(fix,t1);
+ fiy = _mm_add_ps(fiy,t2);
+ fiz = _mm_add_ps(fiz,t3);
+
+ t1 = _mm_shuffle_ps(fix,fix,_MM_SHUFFLE(1,1,1,1));
+ t2 = _mm_shuffle_ps(fiy,fiy,_MM_SHUFFLE(1,1,1,1));
+ t3 = _mm_shuffle_ps(fiz,fiz,_MM_SHUFFLE(1,1,1,1));
+
+ fix = _mm_add_ps(fix,t1);
+ fiy = _mm_add_ps(fiy,t2);
+ fiz = _mm_add_ps(fiz,t3);
+
+ xmm2 = _mm_unpacklo_ps(fix,fiy); /* fx, fy, - - */
+ xmm2 = _mm_movelh_ps(xmm2,fiz);
+ xmm2 = _mm_and_ps( gmx_mm_castsi128_ps(maski), xmm2);
+
+ /* load i force from memory */
+ xmm4 = _mm_loadl_pi(xmm4, (__m64 *) (faction+ii3));
+ xmm5 = _mm_load1_ps(faction+ii3+2);
+ xmm4 = _mm_shuffle_ps(xmm4,xmm5,_MM_SHUFFLE(3,2,1,0));
+
+ /* add to i force */
+ xmm4 = _mm_add_ps(xmm4,xmm2);
+
+ /* store i force to memory */
+ _mm_storel_pi( (__m64 *) (faction+ii3),xmm4);
+ xmm4 = _mm_shuffle_ps(xmm4,xmm4,_MM_SHUFFLE(2,2,2,2));
+ _mm_store_ss(faction+ii3+2,xmm4);
+
+ /* Load, add and store i shift forces */
+ xmm4 = _mm_loadl_pi(xmm4, (__m64 *) (fshift+is3));
+ xmm5 = _mm_load1_ps(fshift+is3+2);
+ xmm4 = _mm_shuffle_ps(xmm4,xmm5,_MM_SHUFFLE(3,2,1,0));
+
+ xmm4 = _mm_add_ps(xmm4,xmm2);
+
+ _mm_storel_pi( (__m64 *) (fshift+is3),xmm4);
+ xmm4 = _mm_shuffle_ps(xmm4,xmm4,_MM_SHUFFLE(2,2,2,2));
+ _mm_store_ss(fshift+is3+2,xmm4);
+
+ /* Coulomb potential */
+ ggid = gid[n];
+
+ vcoul = _mm_movehl_ps(vcoul,vctot);
+ vctot = _mm_add_ps(vctot,vcoul);
+ vcoul = _mm_shuffle_ps(vctot,vctot,_MM_SHUFFLE(1,1,1,1));
+ vctot = _mm_add_ss(vctot,vcoul);
+
+ _mm_store_ss(&vct,vctot);
+ Vc[ggid] = Vc[ggid] + vct;
+
+ Vvdwtmp = _mm_movehl_ps(Vvdwtmp,Vvdwtot);
+ Vvdwtot = _mm_add_ps(Vvdwtot,Vvdwtmp);
+ Vvdwtmp = _mm_shuffle_ps(Vvdwtot,Vvdwtot,_MM_SHUFFLE(1,1,1,1));
+ Vvdwtot = _mm_add_ss(Vvdwtot,Vvdwtmp);
+
+ _mm_store_ss(&vdwt,Vvdwtot);
+ Vvdw[ggid] = Vvdw[ggid] + vdwt;
+
+ /* dvda */
+ dvdatmp = _mm_movehl_ps(dvdatmp,dvdasum);
+ dvdasum = _mm_add_ps(dvdasum,dvdatmp);
+ dvdatmp = _mm_shuffle_ps(dvdasum,dvdasum,_MM_SHUFFLE(1,1,1,1));
+ dvdasum = _mm_add_ss(dvdasum,dvdatmp);
+
+ _mm_store_ss(&dva,dvdasum);
+ dvda[ii] = dvda[ii] + dva*isai_f*isai_f;
+
+ /* Store the GB potential to the work array */
+ vgb = _mm_movehl_ps(vgb,vgbtot);
+ vgbtot = _mm_add_ps(vgbtot,vgb);
+ vgb = _mm_shuffle_ps(vgbtot,vgbtot,_MM_SHUFFLE(1,1,1,1));
+ vgbtot = _mm_add_ss(vgbtot,vgb);
+
+ _mm_store_ss(&vgbt,vgbtot);
+ gpol[ggid] = gpol[ggid] + vgbt;
+ }
+
+ *outeriter = nri;
+ *inneriter = nj1;
+}
+
+
+/*
+ * Gromacs nonbonded kernel nb_kernel430nf
+ * Coulomb interaction: Generalized-Born
+ * VdW interaction: Tabulated
+ * water optimization: No
+ * Calculate forces: no
+ */
+void nb_kernel430nf_x86_64_sse(
+ int * p_nri,
+ int * iinr,
+ int * jindex,
+ int * jjnr,
+ int * shift,
+ float * shiftvec,
+ float * fshift,
+ int * gid,
+ float * pos,
+ float * faction,
+ float * charge,
+ float * p_facel,
+ float * p_krf,
+ float * p_crf,
+ float * Vc,
+ int * type,
+ int * p_ntype,
+ float * vdwparam,
+ float * Vvdw,
+ float * p_tabscale,
+ float * VFtab,
+ float * invsqrta,
+ float * dvda,
+ float * p_gbtabscale,
+ float * GBtab,
+ int * p_nthreads,
+ int * count,
+ void * mtx,
+ int * outeriter,
+ int * inneriter,
+ float * work)
+{
+ int nri,ntype,nthreads;
+ float facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
+ int n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
+ float shX,shY,shZ;
+ float iq;
+ float qq,vcoul,vctot;
+ int nti;
+ int tj;
+ float Vvdw6,Vvdwtot;
+ float Vvdw12;
+ float r,rt,eps,eps2;
+ int n0,nnn;
+ float Y,F,Geps,Heps2,Fp,VV;
+ float isai,isaj,isaprod,gbscale;
+ float ix1,iy1,iz1;
+ float jx1,jy1,jz1;
+ float dx11,dy11,dz11,rsq11,rinv11;
+ float c6,c12;
+
+ nri = *p_nri;
+ ntype = *p_ntype;
+ nthreads = *p_nthreads;
+ facel = *p_facel;
+ krf = *p_krf;
+ crf = *p_crf;
+ tabscale = *p_tabscale;
+ gbtabscale = *p_gbtabscale;
+ nj1 = 0;
+
+ for(n=0; (n<nri); n++)
+ {
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
+ ix1 = shX + pos[ii3+0];
+ iy1 = shY + pos[ii3+1];
+ iz1 = shZ + pos[ii3+2];
+ iq = facel*charge[ii];
+ isai = invsqrta[ii];
+ nti = 2*ntype*type[ii];
+ vctot = 0;
+ Vvdwtot = 0;
- ggid = gid[n];
+ for(k=nj0; (k<nj1); k++)
+ {
+ jnr = jjnr[k];
+ j3 = 3*jnr;
+ jx1 = pos[j3+0];
+ jy1 = pos[j3+1];
+ jz1 = pos[j3+2];
+ dx11 = ix1 - jx1;
+ dy11 = iy1 - jy1;
+ dz11 = iz1 - jz1;
+ rsq11 = dx11*dx11+dy11*dy11+dz11*dz11;
+ rinv11 = gmx_invsqrt(rsq11);
+ isaj = invsqrta[jnr];
+ isaprod = isai*isaj;
+ qq = iq*charge[jnr];
+ vcoul = qq*rinv11;
+ qq = isaprod*(-qq);
+ gbscale = isaprod*gbtabscale;
+ tj = nti+2*type[jnr];
+ c6 = vdwparam[tj];
+ c12 = vdwparam[tj+1];
+ r = rsq11*rinv11;
+ rt = r*gbscale;
+ n0 = rt;
+ eps = rt-n0;
+ eps2 = eps*eps;
+ nnn = 4*n0;
+ Y = GBtab[nnn];
+ F = GBtab[nnn+1];
+ Geps = eps*GBtab[nnn+2];
+ Heps2 = eps2*GBtab[nnn+3];
+ Fp = F+Geps+Heps2;
+ VV = Y+eps*Fp;
+ vgb = qq*VV;
+ vctot = vctot + vcoul;
+ r = rsq11*rinv11;
+ rt = r*tabscale;
+ n0 = rt;
+ eps = rt-n0;
+ eps2 = eps*eps;
+ nnn = 8*n0;
+ Y = VFtab[nnn];
+ F = VFtab[nnn+1];
+ Geps = eps*VFtab[nnn+2];
+ Heps2 = eps2*VFtab[nnn+3];
+ Fp = F+Geps+Heps2;
+ VV = Y+eps*Fp;
+ Vvdw6 = c6*VV;
+ nnn = nnn+4;
+ Y = VFtab[nnn];
+ F = VFtab[nnn+1];
+ Geps = eps*VFtab[nnn+2];
+ Heps2 = eps2*VFtab[nnn+3];
+ Fp = F+Geps+Heps2;
+ VV = Y+eps*Fp;
+ Vvdw12 = c12*VV;
+ Vvdwtot = Vvdwtot+ Vvdw6 + Vvdw12;
+ }
- gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
- gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
- }
+ ggid = gid[n];
+ Vc[ggid] = Vc[ggid] + vctot;
+ Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
+ }
- *outeriter = nri;
- *inneriter = nj1;
+ *outeriter = nri;
+ *inneriter = nj1;
}
+
/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
+#include "nb_kernel400_ia32_sse2.h"
+
-
void nb_kernel400_ia32_sse2(int * p_nri,
- int * iinr,
- int * jindex,
- int * jjnr,
- int * shift,
- double * shiftvec,
- double * fshift,
- int * gid,
- double * pos,
- double * faction,
- double * charge,
- double * p_facel,
- double * p_krf,
- double * p_crf,
- double * Vc,
- int * type,
- int * p_ntype,
- double * vdwparam,
- double * Vvdw,
- double * p_tabscale,
- double * VFtab,
- double * invsqrta,
- double * dvda,
- double * p_gbtabscale,
- double * GBtab,
- int * p_nthreads,
- int * count,
- void * mtx,
- int * outeriter,
- int * inneriter,
- double * work)
+ int * iinr,
+ int * jindex,
+ int * jjnr,
+ int * shift,
+ double * shiftvec,
+ double * fshift,
+ int * gid,
+ double * pos,
+ double * faction,
+ double * charge,
+ double * p_facel,
+ double * p_krf,
+ double * p_crf,
+ double * vc,
+ int * type,
+ int * p_ntype,
+ double * vdwparam,
+ double * vvdw,
+ double * p_tabscale,
+ double * VFtab,
+ double * invsqrta,
+ double * dvda,
+ double * p_gbtabscale,
+ double * GBtab,
+ int * p_nthreads,
+ int * count,
+ void * mtx,
+ int * outeriter,
+ int * inneriter,
+ double * work)
{
- int nri,ntype,nthreads,offset;
- int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
- double facel,krf,crf,tabscl,gbtabscl,vct,vgbt;
- double shX,shY,shZ,isai_d,dva;
+ int nri,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
+ int jnrA,jnrB;
+ int j3A,j3B;
gmx_gbdata_t *gbdata;
- double * gpol;
+ double * gpol;
+
+ __m128d iq,qq,jq,isai;
+ __m128d ix,iy,iz;
+ __m128d jx,jy,jz;
+ __m128d dx,dy,dz;
+ __m128d vctot,vgbtot,dvdasum,gbfactor;
+ __m128d fix,fiy,fiz,tx,ty,tz,rsq;
+ __m128d rinv,isaj,isaprod;
+ __m128d vcoul,fscal,gbscale;
+ __m128d rinvsq,r,rtab;
+ __m128d eps,Y,F,G,H;
+ __m128d vgb,fijGB,dvdatmp;
+ __m128d facel,gbtabscale,dvdaj;
+ __m128i n0, nnn;
- __m128d ix,iy,iz,jx,jy,jz;
- __m128d dx,dy,dz,t1,t2,t3;
- __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
- __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
- __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
- __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
- __m128d fac,tabscale,gbtabscale,gbfactor;
- __m128i n0,nnn;
-
- const __m128d neg = {-1.0f,-1.0f};
- const __m128d zero = {0.0f,0.0f};
- const __m128d half = {0.5f,0.5f};
- const __m128d two = {2.0f,2.0f};
- const __m128d three = {3.0f,3.0f};
+ const __m128d neg = _mm_set1_pd(-1.0);
+ const __m128d zero = _mm_set1_pd(0.0);
+ const __m128d minushalf = _mm_set1_pd(-0.5);
+ const __m128d two = _mm_set1_pd(2.0);
gbdata = (gmx_gbdata_t *)work;
gpol = gbdata->gpol;
-
+
nri = *p_nri;
- ntype = *p_ntype;
- nthreads = *p_nthreads;
- facel = *p_facel;
- krf = *p_krf;
- crf = *p_crf;
- tabscl = *p_tabscale;
- gbtabscl = *p_gbtabscale;
- nj1 = 0;
-
- /* Splat variables */
- fac = _mm_load1_pd(&facel);
- tabscale = _mm_load1_pd(&tabscl);
- gbtabscale = _mm_load1_pd(&gbtabscl);
- gbfactor = _mm_set1_pd(((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
-
- /* Keep compiler happy */
- dvdatmp = _mm_setzero_pd();
- vgb = _mm_setzero_pd();
- dvdaj = _mm_setzero_pd();
- isaj = _mm_setzero_pd();
- vcoul = _mm_setzero_pd();
- t1 = _mm_setzero_pd();
- t2 = _mm_setzero_pd();
- t3 = _mm_setzero_pd();
-
- jnr1=jnr2=0;
- j13=j23=0;
+
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- offset = (nj1-nj0)%2;
-
- ii = iinr[n];
- ii3 = ii*3;
-
- ix = _mm_set1_pd(shX+pos[ii3+0]);
- iy = _mm_set1_pd(shX+pos[ii3+1]);
- iz = _mm_set1_pd(shX+pos[ii3+2]);
- q = _mm_set1_pd(charge[ii]);
-
- iq = _mm_mul_pd(fac,q);
- isai_d = invsqrta[ii];
- isai = _mm_load1_pd(&isai_d);
-
- fix = _mm_setzero_pd();
- fiy = _mm_setzero_pd();
- fiz = _mm_setzero_pd();
- dvdasum = _mm_setzero_pd();
- vctot = _mm_setzero_pd();
- vgbtot = _mm_setzero_pd();
-
- for(k=nj0;k<nj1-offset; k+=2)
+ ix = _mm_set1_pd(shX+pos[ii3+0]);
+ iy = _mm_set1_pd(shY+pos[ii3+1]);
+ iz = _mm_set1_pd(shZ+pos[ii3+2]);
+
+ iq = _mm_load1_pd(charge+ii);
+ iq = _mm_mul_pd(iq,facel);
+
+ isai = _mm_load1_pd(invsqrta+ii);
+
+ vctot = _mm_setzero_pd();
+ vgbtot = _mm_setzero_pd();
+ dvdasum = _mm_setzero_pd();
+ fix = _mm_setzero_pd();
+ fiy = _mm_setzero_pd();
+ fiz = _mm_setzero_pd();
+
+ for(k=nj0;k<nj1-1; k+=2)
{
- jnr1 = jjnr[k];
- jnr2 = jjnr[k+1];
-
- j13 = jnr1 * 3;
- j23 = jnr2 * 3;
-
- /* Load coordinates */
- xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */
- xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */
-
- xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */
- xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */
-
- /* transpose */
- jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
-
- /* distances */
- dx = _mm_sub_pd(ix,jx);
- dy = _mm_sub_pd(iy,jy);
- dz = _mm_sub_pd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
-
- /* Load invsqrta */
- isaj = _mm_loadl_pd(isaj,invsqrta+jnr1);
- isaj = _mm_loadh_pd(isaj,invsqrta+jnr2);
- isaprod = _mm_mul_pd(isai,isaj);
-
- /* Load charges */
- q = _mm_loadl_pd(q,charge+jnr1);
- q = _mm_loadh_pd(q,charge+jnr2);
- qq = _mm_mul_pd(iq,q);
-
- vcoul = _mm_mul_pd(qq,rinv);
- fscal = _mm_mul_pd(vcoul,rinv);
- qq = _mm_mul_pd(isaprod,qq);
- qq = _mm_mul_pd(qq,neg);
- qq = _mm_mul_pd(qq,gbfactor);
- gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
- /* Load dvdaj */
- dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1);
- dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2);
-
- r = _mm_mul_pd(rsq11,rinv);
- rt = _mm_mul_pd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_pd(rt,n0d);
- eps2 = _mm_mul_pd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H,eps2);
- Fp = _mm_add_pd(F,G);
- Fp = _mm_add_pd(Fp,H);
- VV = _mm_mul_pd(Fp,eps);
- VV = _mm_add_pd(Y,VV);
- H = _mm_mul_pd(two,H);
- FF = _mm_add_pd(Fp,G);
- FF = _mm_add_pd(FF,H);
- vgb = _mm_mul_pd(qq,VV);
- fijC = _mm_mul_pd(qq,FF);
- fijC = _mm_mul_pd(fijC,gbscale);
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
- dvdatmp = _mm_mul_pd(fijC,r);
- dvdatmp = _mm_add_pd(vgb,dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp,neg);
- dvdatmp = _mm_mul_pd(dvdatmp,half);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_pd(dvdatmp,isaj);
- xmm1 = _mm_mul_pd(xmm1,isaj);
- dvdaj = _mm_add_pd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
- _mm_storeh_pd(dvda+jnr2,dvdaj);
-
- vctot = _mm_add_pd(vctot,vcoul);
- vgbtot = _mm_add_pd(vgbtot,vgb);
-
- fscal = _mm_sub_pd(fijC,fscal);
- fscal = _mm_mul_pd(fscal,neg);
- fscal = _mm_mul_pd(fscal,rinv);
-
- /* calculate partial force terms */
- t1 = _mm_mul_pd(fscal,dx);
- t2 = _mm_mul_pd(fscal,dy);
- t3 = _mm_mul_pd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
- xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-
- xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
- xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-
- /* transpose */
- xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
- xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
- xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_pd(xmm5,t1);
- xmm6 = _mm_sub_pd(xmm6,t2);
- xmm7 = _mm_sub_pd(xmm7,t3);
-
- xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
- xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* store fx and fy */
- _mm_storeu_pd(faction+j13,xmm1);
- _mm_storeu_pd(faction+j23,xmm2);
-
- /* .. then fz */
- _mm_storel_pd(faction+j13+2,xmm7);
- _mm_storeh_pd(faction+j23+2,xmm7);
- }
+ j3A = jnrA * 3;
+ j3B = jnrB * 3;
+
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+
+ dx = _mm_sub_pd(ix,jx);
+ dy = _mm_sub_pd(iy,jy);
+ dz = _mm_sub_pd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_pd(rinv,rinv);
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+ GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+
+ isaprod = _mm_mul_pd(isai,isaj);
+ qq = _mm_mul_pd(iq,jq);
+ vcoul = _mm_mul_pd(qq,rinv);
+ fscal = _mm_mul_pd(vcoul,rinv);
+ vctot = _mm_add_pd(vctot,vcoul);
+
+ /* Polarization interaction */
+ qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+ gbscale = _mm_mul_pd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_pd(rsq,rinv);
+ rtab = _mm_mul_pd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+
+ fscal = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
+ }
+
/* In double precision, offset can only be either 0 or 1 */
- if(offset!=0)
+ if(k<nj1)
{
- jnr1 = jjnr[k];
- j13 = jnr1*3;
-
- jx = _mm_load_sd(pos+j13);
- jy = _mm_load_sd(pos+j13+1);
- jz = _mm_load_sd(pos+j13+2);
-
- isaj = _mm_load_sd(invsqrta+jnr1);
- isaprod = _mm_mul_sd(isai,isaj);
- dvdaj = _mm_load_sd(dvda+jnr1);
- q = _mm_load_sd(charge+jnr1);
- qq = _mm_mul_sd(iq,q);
-
- dx = _mm_sub_sd(ix,jx);
- dy = _mm_sub_sd(iy,jy);
- dz = _mm_sub_sd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
+ jnrA = jjnr[k];
+ j3A = jnrA * 3;
- vcoul = _mm_mul_sd(qq,rinv);
- fscal = _mm_mul_sd(vcoul,rinv);
- qq = _mm_mul_sd(isaprod,qq);
- qq = _mm_mul_sd(qq,neg);
- qq = _mm_mul_sd(qq,gbfactor);
- gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
- r = _mm_mul_sd(rsq11,rinv);
- rt = _mm_mul_sd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_sd(rt,n0d);
- eps2 = _mm_mul_sd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2);
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2);
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0));
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1));
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H,eps2);
- Fp = _mm_add_sd(F,G);
- Fp = _mm_add_sd(Fp,H);
- VV = _mm_mul_sd(Fp,eps);
- VV = _mm_add_sd(Y,VV);
- H = _mm_mul_sd(two,H);
- FF = _mm_add_sd(Fp,G);
- FF = _mm_add_sd(FF,H);
- vgb = _mm_mul_sd(qq,VV);
- fijC = _mm_mul_sd(qq,FF);
- fijC = _mm_mul_sd(fijC,gbscale);
-
- dvdatmp = _mm_mul_sd(fijC,r);
- dvdatmp = _mm_add_sd(vgb,dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp,neg);
- dvdatmp = _mm_mul_sd(dvdatmp,half);
- dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_sd(dvdatmp,isaj);
- xmm1 = _mm_mul_sd(xmm1,isaj);
- dvdaj = _mm_add_sd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
-
- vctot = _mm_add_sd(vctot,vcoul);
- vgbtot = _mm_add_sd(vgbtot,vgb);
-
- fscal = _mm_sub_sd(fijC,fscal);
- fscal = _mm_mul_sd(fscal,neg);
- fscal = _mm_mul_sd(fscal,rinv);
-
- /* calculate partial force terms */
- t1 = _mm_mul_sd(fscal,dx);
- t2 = _mm_mul_sd(fscal,dy);
- t3 = _mm_mul_sd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_sd(fix,t1);
- fiy = _mm_add_sd(fiy,t2);
- fiz = _mm_add_sd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm5 = _mm_load_sd(faction+j13); /* fx */
- xmm6 = _mm_load_sd(faction+j13+1); /* fy */
- xmm7 = _mm_load_sd(faction+j13+2); /* fz */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_sd(xmm5,t1);
- xmm6 = _mm_sub_sd(xmm6,t2);
- xmm7 = _mm_sub_sd(xmm7,t3);
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
+ dx = _mm_sub_sd(ix,jx);
+ dy = _mm_sub_sd(iy,jy);
+ dz = _mm_sub_sd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_sd(rinv,rinv);
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+ GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+
+ isaprod = _mm_mul_sd(isai,isaj);
+ qq = _mm_mul_sd(iq,jq);
+ vcoul = _mm_mul_sd(qq,rinv);
+ fscal = _mm_mul_sd(vcoul,rinv);
+ vctot = _mm_add_sd(vctot,vcoul);
+
+ /* Polarization interaction */
+ qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+ gbscale = _mm_mul_sd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_sd(rsq,rinv);
+ rtab = _mm_mul_sd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_sd(vgbtot, vgb);
+
+ dvdasum = _mm_add_sd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
- /* store forces */
- _mm_store_sd(faction+j13,xmm5);
- _mm_store_sd(faction+j13+1,xmm6);
- _mm_store_sd(faction+j13+2,xmm7);
+ fscal = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- /* fix/fiy/fiz now contain four partial terms, that all should be
- * added to the i particle forces
- */
- t1 = _mm_unpacklo_pd(t1,fix);
- t2 = _mm_unpacklo_pd(t2,fiy);
- t3 = _mm_unpacklo_pd(t3,fiz);
-
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
- fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
- fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-
- /* Load i forces from memory */
- xmm1 = _mm_load_sd(faction+ii3);
- xmm2 = _mm_load_sd(faction+ii3+1);
- xmm3 = _mm_load_sd(faction+ii3+2);
-
- /* Add to i force */
- fix = _mm_add_sd(fix,xmm1);
- fiy = _mm_add_sd(fiy,xmm2);
- fiz = _mm_add_sd(fiz,xmm3);
-
- /* store i forces to memory */
- _mm_store_sd(faction+ii3,fix);
- _mm_store_sd(faction+ii3+1,fiy);
- _mm_store_sd(faction+ii3+2,fiz);
-
- /* Load i shift forces from memory */
- xmm1 = _mm_load_sd(fshift+is3);
- xmm2 = _mm_load_sd(fshift+is3+1);
- xmm3 = _mm_load_sd(fshift+is3+2);
-
- /* Add to i force */
- fix = _mm_add_sd(fix,xmm1);
- fiy = _mm_add_sd(fiy,xmm2);
- fiz = _mm_add_sd(fiz,xmm3);
-
- /* store i shift forces to memory */
- _mm_store_sd(fshift+is3,fix);
- _mm_store_sd(fshift+is3+1,fiy);
- _mm_store_sd(fshift+is3+2,fiz);
-
- /* now do dvda */
- dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
- _mm_storeh_pd(&dva,dvdasum);
- dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-
- ggid = gid[n];
-
- /* Coulomb potential */
- vcoul = _mm_unpacklo_pd(vcoul,vctot);
- vctot = _mm_add_pd(vctot,vcoul);
- _mm_storeh_pd(&vct,vctot);
- Vc[ggid] = Vc[ggid] + vct;
-
- /* GB potential */
- vgb = _mm_unpacklo_pd(vgb,vgbtot);
- vgbtot = _mm_add_pd(vgbtot,vgb);
- _mm_storeh_pd(&vgbt,vgbtot);
- gpol[ggid] = gpol[ggid] + vgbt;
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+
+ ggid = gid[n];
+
+ gmx_mm_update_1pot_pd(vctot,vc+ggid);
+ gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
}
-
+
*outeriter = nri;
- *inneriter = nj1;
-
+ *inneriter = nj1;
}
/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
+#include "nb_kernel410_ia32_sse2.h"
+
+
+
void nb_kernel410_ia32_sse2(int * p_nri,
int * iinr,
int * jindex,
double * p_facel,
double * p_krf,
double * p_crf,
- double * Vc,
+ double * vc,
int * type,
int * p_ntype,
double * vdwparam,
- double * Vvdw,
+ double * vvdw,
double * p_tabscale,
double * VFtab,
double * invsqrta,
int * inneriter,
double * work)
{
- int nri,ntype,nthreads,offset,tj,tj2,nti;
- int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
- double facel,krf,crf,tabscl,gbtabscl,vct,vdwt,nt1,nt2;
- double shX,shY,shZ,isai_d,dva,vgbt;
+ int nri,ntype,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
+ int offset,nti;
+ int jnrA,jnrB;
+ int j3A,j3B;
+ int tjA,tjB;
gmx_gbdata_t *gbdata;
- double * gpol;
-
- __m128d ix,iy,iz,jx,jy,jz;
- __m128d dx,dy,dz,t1,t2,t3;
- __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
- __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
- __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
- __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
- __m128d c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,rinvsq,rinvsix;
- __m128d fac,tabscale,gbtabscale,gbfactor;
- __m128i n0,nnn;
+ double * gpol;
+
+ __m128d iq,qq,jq,isai;
+ __m128d ix,iy,iz;
+ __m128d jx,jy,jz;
+ __m128d dx,dy,dz;
+ __m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
+ __m128d fix,fiy,fiz,tx,ty,tz,rsq;
+ __m128d rinv,isaj,isaprod;
+ __m128d vcoul,fscal,gbscale,c6,c12;
+ __m128d rinvsq,r,rtab;
+ __m128d eps,Y,F,G,H;
+ __m128d vgb,fijGB,dvdatmp;
+ __m128d rinvsix,vvdw6,vvdw12;
+ __m128d facel,gbtabscale,dvdaj;
+ __m128i n0, nnn;
- const __m128d neg = {-1.0f,-1.0f};
- const __m128d zero = {0.0f,0.0f};
- const __m128d half = {0.5f,0.5f};
- const __m128d two = {2.0f,2.0f};
- const __m128d three = {3.0f,3.0f};
- const __m128d six = {6.0f,6.0f};
- const __m128d twelwe = {12.0f,12.0f};
+ const __m128d neg = _mm_set1_pd(-1.0);
+ const __m128d zero = _mm_set1_pd(0.0);
+ const __m128d minushalf = _mm_set1_pd(-0.5);
+ const __m128d two = _mm_set1_pd(2.0);
+ const __m128d six = _mm_set1_pd(6.0);
+ const __m128d twelve = _mm_set1_pd(12.0);
gbdata = (gmx_gbdata_t *)work;
gpol = gbdata->gpol;
nri = *p_nri;
ntype = *p_ntype;
- nthreads = *p_nthreads;
- facel = *p_facel;
- krf = *p_krf;
- crf = *p_crf;
- tabscl = *p_tabscale;
- gbtabscl = *p_gbtabscale;
- nj1 = 0;
-
- /* Splat variables */
- fac = _mm_load1_pd(&facel);
- tabscale = _mm_load1_pd(&tabscl);
- gbtabscale = _mm_load1_pd(&gbtabscl);
- gbfactor = _mm_set1_pd(((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
-
- /* Keep compiler happy */
- Vvdwtmp = _mm_setzero_pd();
- Vvdwtot = _mm_setzero_pd();
- dvdatmp = _mm_setzero_pd();
- dvdaj = _mm_setzero_pd();
- isaj = _mm_setzero_pd();
- vcoul = _mm_setzero_pd();
- vgb = _mm_setzero_pd();
- t1 = _mm_setzero_pd();
- t2 = _mm_setzero_pd();
- t3 = _mm_setzero_pd();
- xmm1 = _mm_setzero_pd();
- xmm2 = _mm_setzero_pd();
- xmm3 = _mm_setzero_pd();
- xmm4 = _mm_setzero_pd();
- jnr1 = jnr2 = 0;
- j13 = j23 = 0;
+
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
+ c6 = _mm_setzero_pd();
+ c12 = _mm_setzero_pd();
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
-
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- offset = (nj1-nj0)%2;
-
- ii = iinr[n];
- ii3 = ii*3;
-
- ix = _mm_set1_pd(shX+pos[ii3+0]);
- iy = _mm_set1_pd(shX+pos[ii3+1]);
- iz = _mm_set1_pd(shX+pos[ii3+2]);
- q = _mm_set1_pd(charge[ii]);
-
- iq = _mm_mul_pd(fac,q);
- isai_d = invsqrta[ii];
- isai = _mm_load1_pd(&isai_d);
-
- nti = 2*ntype*type[ii];
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
- fix = _mm_setzero_pd();
- fiy = _mm_setzero_pd();
- fiz = _mm_setzero_pd();
- dvdasum = _mm_setzero_pd();
- vctot = _mm_setzero_pd();
- vgbtot = _mm_setzero_pd();
- Vvdwtot = _mm_setzero_pd();
+ ix = _mm_set1_pd(shX+pos[ii3+0]);
+ iy = _mm_set1_pd(shY+pos[ii3+1]);
+ iz = _mm_set1_pd(shZ+pos[ii3+2]);
+
+ iq = _mm_load1_pd(charge+ii);
+ iq = _mm_mul_pd(iq,facel);
+
+ isai = _mm_load1_pd(invsqrta+ii);
+
+ nti = 2*ntype*type[ii];
- for(k=nj0;k<nj1-offset; k+=2)
+ vctot = _mm_setzero_pd();
+ vvdwtot = _mm_setzero_pd();
+ vgbtot = _mm_setzero_pd();
+ dvdasum = _mm_setzero_pd();
+ fix = _mm_setzero_pd();
+ fiy = _mm_setzero_pd();
+ fiz = _mm_setzero_pd();
+
+ for(k=nj0;k<nj1-1; k+=2)
{
- jnr1 = jjnr[k];
- jnr2 = jjnr[k+1];
-
- j13 = jnr1 * 3;
- j23 = jnr2 * 3;
-
- /* Load coordinates */
- xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */
- xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */
-
- xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */
- xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */
-
- /* transpose */
- jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
-
- /* distances */
- dx = _mm_sub_pd(ix,jx);
- dy = _mm_sub_pd(iy,jy);
- dz = _mm_sub_pd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
-
- /* Load invsqrta */
- isaj = _mm_loadl_pd(isaj,invsqrta+jnr1);
- isaj = _mm_loadh_pd(isaj,invsqrta+jnr2);
- isaprod = _mm_mul_pd(isai,isaj);
-
- /* Load charges */
- q = _mm_loadl_pd(q,charge+jnr1);
- q = _mm_loadh_pd(q,charge+jnr2);
- qq = _mm_mul_pd(iq,q);
-
- vcoul = _mm_mul_pd(qq,rinv);
- fscal = _mm_mul_pd(vcoul,rinv);
- qq = _mm_mul_pd(isaprod,qq);
- qq = _mm_mul_pd(qq,gbfactor);
- qq = _mm_mul_pd(qq,neg);
- gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
- /* Load VdW parameters */
- tj = nti+2*type[jnr1];
- tj2 = nti+2*type[jnr2];
-
- xmm1 = _mm_loadu_pd(vdwparam+tj);
- xmm2 = _mm_loadu_pd(vdwparam+tj2);
- c6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- c12 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
-
- rinvsq = _mm_mul_pd(rinv,rinv);
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
- /* Load dvdaj */
- dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1);
- dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2);
-
- r = _mm_mul_pd(rsq11,rinv);
- rt = _mm_mul_pd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_pd(rt,n0d);
- eps2 = _mm_mul_pd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H,eps2);
- Fp = _mm_add_pd(F,G);
- Fp = _mm_add_pd(Fp,H);
- VV = _mm_mul_pd(Fp,eps);
- VV = _mm_add_pd(Y,VV);
- H = _mm_mul_pd(two,H);
- FF = _mm_add_pd(Fp,G);
- FF = _mm_add_pd(FF,H);
- vgb = _mm_mul_pd(qq,VV);
- fijC = _mm_mul_pd(qq,FF);
- fijC = _mm_mul_pd(fijC,gbscale);
-
- dvdatmp = _mm_mul_pd(fijC,r);
- dvdatmp = _mm_add_pd(vgb,dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp,neg);
- dvdatmp = _mm_mul_pd(dvdatmp,half);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_pd(dvdatmp,isaj);
- xmm1 = _mm_mul_pd(xmm1,isaj);
- dvdaj = _mm_add_pd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
- _mm_storeh_pd(dvda+jnr2,dvdaj);
-
- vctot = _mm_add_pd(vctot,vcoul);
- vgbtot = _mm_add_pd(vgbtot,vgb);
-
- /* VdW interaction */
- rinvsix = _mm_mul_pd(rinvsq,rinvsq);
- rinvsix = _mm_mul_pd(rinvsix,rinvsq);
-
- Vvdw6 = _mm_mul_pd(c6,rinvsix);
- Vvdw12 = _mm_mul_pd(c12,rinvsix);
- Vvdw12 = _mm_mul_pd(Vvdw12,rinvsix);
- Vvdwtmp = _mm_sub_pd(Vvdw12,Vvdw6);
- Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
-
- xmm1 = _mm_mul_pd(twelwe,Vvdw12);
- xmm2 = _mm_mul_pd(six,Vvdw6);
- xmm1 = _mm_sub_pd(xmm1,xmm2);
- xmm1 = _mm_mul_pd(xmm1,rinvsq);
-
- /* Scalar force */
- fscal = _mm_sub_pd(fijC,fscal);
- fscal = _mm_mul_pd(fscal,rinv);
- fscal = _mm_sub_pd(xmm1,fscal);
-
- /* calculate partial force terms */
- t1 = _mm_mul_pd(fscal,dx);
- t2 = _mm_mul_pd(fscal,dy);
- t3 = _mm_mul_pd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
- xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-
- xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
- xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-
- /* transpose */
- xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
- xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
- xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_pd(xmm5,t1);
- xmm6 = _mm_sub_pd(xmm6,t2);
- xmm7 = _mm_sub_pd(xmm7,t3);
-
- xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
- xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* store fx and fy */
- _mm_storeu_pd(faction+j13,xmm1);
- _mm_storeu_pd(faction+j23,xmm2);
-
- /* .. then fz */
- _mm_storel_pd(faction+j13+2,xmm7);
- _mm_storeh_pd(faction+j23+2,xmm7);
+ j3A = jnrA * 3;
+ j3B = jnrB * 3;
+
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+
+ dx = _mm_sub_pd(ix,jx);
+ dy = _mm_sub_pd(iy,jy);
+ dz = _mm_sub_pd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_pd(rinv,rinv);
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+ GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+ tjB = nti+2*type[jnrB];
+
+ GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+
+ isaprod = _mm_mul_pd(isai,isaj);
+ qq = _mm_mul_pd(iq,jq);
+ vcoul = _mm_mul_pd(qq,rinv);
+ fscal = _mm_mul_pd(vcoul,rinv);
+ vctot = _mm_add_pd(vctot,vcoul);
+
+ /* Polarization interaction */
+ qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+ gbscale = _mm_mul_pd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_pd(rsq,rinv);
+ rtab = _mm_mul_pd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+
+ rinvsix = _mm_mul_pd(rinvsq,rinvsq);
+ rinvsix = _mm_mul_pd(rinvsix,rinvsq);
+
+ vvdw6 = _mm_mul_pd(c6,rinvsix);
+ vvdw12 = _mm_mul_pd(c12, _mm_mul_pd(rinvsix,rinvsix));
+ vvdwtot = _mm_add_pd(vvdwtot,_mm_sub_pd(vvdw12,vvdw6));
+
+ fscal = _mm_sub_pd(_mm_mul_pd(rinvsq,
+ _mm_sub_pd(_mm_mul_pd(twelve,vvdw12),
+ _mm_mul_pd(six,vvdw6))),
+ _mm_mul_pd( _mm_sub_pd( fijGB,fscal),rinv ));
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
/* In double precision, offset can only be either 0 or 1 */
- if(offset!=0)
+ if(k<nj1)
{
- jnr1 = jjnr[k];
- j13 = jnr1*3;
-
- jx = _mm_load_sd(pos+j13);
- jy = _mm_load_sd(pos+j13+1);
- jz = _mm_load_sd(pos+j13+2);
-
- isaj = _mm_load_sd(invsqrta+jnr1);
- isaprod = _mm_mul_sd(isai,isaj);
- dvdaj = _mm_load_sd(dvda+jnr1);
- q = _mm_load_sd(charge+jnr1);
- qq = _mm_mul_sd(iq,q);
-
- dx = _mm_sub_sd(ix,jx);
- dy = _mm_sub_sd(iy,jy);
- dz = _mm_sub_sd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
-
- vcoul = _mm_mul_sd(qq,rinv);
- fscal = _mm_mul_sd(vcoul,rinv);
- qq = _mm_mul_sd(isaprod,qq);
- qq = _mm_mul_sd(qq,gbfactor);
- qq = _mm_mul_sd(qq,neg);
- gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
- /* Load VdW parameters */
- tj = nti+2*type[jnr1];
-
- c6 = _mm_load_sd(vdwparam+tj);
- c12 = _mm_load_sd(vdwparam+tj+1);
-
- rinvsq = _mm_mul_sd(rinv,rinv);
-
- r = _mm_mul_sd(rsq11,rinv);
- rt = _mm_mul_sd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_sd(rt,n0d);
- eps2 = _mm_mul_sd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2);
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2);
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0));
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1));
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H,eps2);
- Fp = _mm_add_sd(F,G);
- Fp = _mm_add_sd(Fp,H);
- VV = _mm_mul_sd(Fp,eps);
- VV = _mm_add_sd(Y,VV);
- H = _mm_mul_sd(two,H);
- FF = _mm_add_sd(Fp,G);
- FF = _mm_add_sd(FF,H);
- vgb = _mm_mul_sd(qq,VV);
- fijC = _mm_mul_sd(qq,FF);
- fijC = _mm_mul_sd(fijC,gbscale);
-
- dvdatmp = _mm_mul_sd(fijC,r);
- dvdatmp = _mm_add_sd(vgb,dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp,neg);
- dvdatmp = _mm_mul_sd(dvdatmp,half);
- dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_sd(dvdatmp,isaj);
- xmm1 = _mm_mul_sd(xmm1,isaj);
- dvdaj = _mm_add_sd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
-
- vctot = _mm_add_sd(vctot,vcoul);
- vgbtot = _mm_add_sd(vgbtot,vgb);
-
- /* VdW interaction */
- rinvsix = _mm_mul_sd(rinvsq,rinvsq);
- rinvsix = _mm_mul_sd(rinvsix,rinvsq);
-
- Vvdw6 = _mm_mul_sd(c6,rinvsix);
- Vvdw12 = _mm_mul_sd(c12,rinvsix);
- Vvdw12 = _mm_mul_sd(Vvdw12,rinvsix);
- Vvdwtmp = _mm_sub_sd(Vvdw12,Vvdw6);
- Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);
-
- xmm1 = _mm_mul_sd(twelwe,Vvdw12);
- xmm2 = _mm_mul_sd(six,Vvdw6);
- xmm1 = _mm_sub_sd(xmm1,xmm2);
- xmm1 = _mm_mul_sd(xmm1,rinvsq);
-
- /* Scalar force */
- fscal = _mm_sub_sd(fijC,fscal);
- fscal = _mm_mul_sd(fscal,rinv);
- fscal = _mm_sub_sd(xmm1,fscal);
-
- /* calculate partial force terms */
- t1 = _mm_mul_sd(fscal,dx);
- t2 = _mm_mul_sd(fscal,dy);
- t3 = _mm_mul_sd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_sd(fix,t1);
- fiy = _mm_add_sd(fiy,t2);
- fiz = _mm_add_sd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm5 = _mm_load_sd(faction+j13); /* fx */
- xmm6 = _mm_load_sd(faction+j13+1); /* fy */
- xmm7 = _mm_load_sd(faction+j13+2); /* fz */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_sd(xmm5,t1);
- xmm6 = _mm_sub_sd(xmm6,t2);
- xmm7 = _mm_sub_sd(xmm7,t3);
-
- /* store forces */
- _mm_store_sd(faction+j13,xmm5);
- _mm_store_sd(faction+j13+1,xmm6);
- _mm_store_sd(faction+j13+2,xmm7);
+ jnrA = jjnr[k];
+
+ j3A = jnrA * 3;
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
+ dx = _mm_sub_sd(ix,jx);
+ dy = _mm_sub_sd(iy,jy);
+ dz = _mm_sub_sd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_sd(rinv,rinv);
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+ GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+
+ GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+
+ isaprod = _mm_mul_sd(isai,isaj);
+ qq = _mm_mul_sd(iq,jq);
+ vcoul = _mm_mul_sd(qq,rinv);
+ fscal = _mm_mul_sd(vcoul,rinv);
+ vctot = _mm_add_sd(vctot,vcoul);
+
+ /* Polarization interaction */
+ qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+ gbscale = _mm_mul_sd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_sd(rsq,rinv);
+ rtab = _mm_mul_sd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_sd(vgbtot, vgb);
+
+ dvdasum = _mm_add_sd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+
+ rinvsix = _mm_mul_sd(rinvsq,rinvsq);
+ rinvsix = _mm_mul_sd(rinvsix,rinvsq);
+
+ vvdw6 = _mm_mul_sd(c6,rinvsix);
+ vvdw12 = _mm_mul_sd(c12, _mm_mul_sd(rinvsix,rinvsix));
+ vvdwtot = _mm_add_sd(vvdwtot,_mm_sub_sd(vvdw12,vvdw6));
+
+ fscal = _mm_sub_sd(_mm_mul_sd(rinvsq,
+ _mm_sub_sd(_mm_mul_sd(twelve,vvdw12),
+ _mm_mul_sd(six,vvdw6))),
+ _mm_mul_sd( _mm_sub_sd( fijGB,fscal),rinv ));
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- /* fix/fiy/fiz now contain four partial terms, that all should be
- * added to the i particle forces
- */
- t1 = _mm_unpacklo_pd(t1,fix);
- t2 = _mm_unpacklo_pd(t2,fiy);
- t3 = _mm_unpacklo_pd(t3,fiz);
-
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
- fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
- fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-
- /* Load i forces from memory */
- xmm1 = _mm_load_sd(faction+ii3);
- xmm2 = _mm_load_sd(faction+ii3+1);
- xmm3 = _mm_load_sd(faction+ii3+2);
-
- /* Add to i force */
- fix = _mm_add_sd(fix,xmm1);
- fiy = _mm_add_sd(fiy,xmm2);
- fiz = _mm_add_sd(fiz,xmm3);
-
- /* store i forces to memory */
- _mm_store_sd(faction+ii3,fix);
- _mm_store_sd(faction+ii3+1,fiy);
- _mm_store_sd(faction+ii3+2,fiz);
-
- /* Load i shift forces from memory */
- xmm1 = _mm_load_sd(fshift+is3);
- xmm2 = _mm_load_sd(fshift+is3+1);
- xmm3 = _mm_load_sd(fshift+is3+2);
-
- /* Add to i force */
- fix = _mm_add_sd(fix,xmm1);
- fiy = _mm_add_sd(fiy,xmm2);
- fiz = _mm_add_sd(fiz,xmm3);
-
- /* store i shift forces to memory */
- _mm_store_sd(fshift+is3,fix);
- _mm_store_sd(fshift+is3+1,fiy);
- _mm_store_sd(fshift+is3+2,fiz);
-
- /* now do dvda */
- dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
- _mm_storeh_pd(&dva,dvdasum);
- dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-
- ggid = gid[n];
-
- /* Coulomb potential */
- vcoul = _mm_unpacklo_pd(vcoul,vctot);
- vctot = _mm_add_pd(vctot,vcoul);
- _mm_storeh_pd(&vct,vctot);
- Vc[ggid] = Vc[ggid] + vct;
-
- /* VdW potential */
- Vvdwtmp = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
- Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
- _mm_storeh_pd(&vdwt,Vvdwtot);
- Vvdw[ggid] = Vvdw[ggid] + vdwt;
-
- /* GB potential */
- vgb = _mm_unpacklo_pd(vgb,vgbtot);
- vgbtot = _mm_add_pd(vgbtot,vgb);
- _mm_storeh_pd(&vgbt,vgbtot);
- gpol[ggid] = gpol[ggid] + vgbt;
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+
+ ggid = gid[n];
+
+ gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
+ gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
}
*outeriter = nri;
*inneriter = nj1;
-
-
-
-
-
}
/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
-
+#include "nb_kernel430_ia32_sse2.h"
void nb_kernel430_ia32_sse2(int * p_nri,
- int * iinr,
- int * jindex,
- int * jjnr,
- int * shift,
- double * shiftvec,
- double * fshift,
- int * gid,
- double * pos,
- double * faction,
- double * charge,
- double * p_facel,
- double * p_krf,
- double * p_crf,
- double * Vc,
- int * type,
- int * p_ntype,
- double * vdwparam,
- double * Vvdw,
- double * p_tabscale,
- double * VFtab,
- double * invsqrta,
- double * dvda,
- double * p_gbtabscale,
- double * GBtab,
- int * p_nthreads,
- int * count,
- void * mtx,
- int * outeriter,
- int * inneriter,
- double * work)
+ int * iinr,
+ int * jindex,
+ int * jjnr,
+ int * shift,
+ double * shiftvec,
+ double * fshift,
+ int * gid,
+ double * pos,
+ double * faction,
+ double * charge,
+ double * p_facel,
+ double * p_krf,
+ double * p_crf,
+ double * vc,
+ int * type,
+ int * p_ntype,
+ double * vdwparam,
+ double * vvdw,
+ double * p_tabscale,
+ double * VFtab,
+ double * invsqrta,
+ double * dvda,
+ double * p_gbtabscale,
+ double * GBtab,
+ int * p_nthreads,
+ int * count,
+ void * mtx,
+ int * outeriter,
+ int * inneriter,
+ double * work)
{
- int nri,ntype,nthreads,offset,tj,tj2,nti;
- int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
- double facel,krf,crf,tabscl,gbtabscl,vct,vdwt,vgbt,nt1,nt2;
- double shX,shY,shZ,isai_d,dva;
+ int nri,ntype,nthreads;
+ int n,ii,is3,ii3,k,nj0,nj1,ggid;
+ double shX,shY,shZ;
+ int offset,nti;
+ int jnrA,jnrB;
+ int j3A,j3B;
+ int tjA,tjB;
gmx_gbdata_t *gbdata;
- double * gpol;
-
- __m128d ix,iy,iz,jx,jy,jz;
- __m128d dx,dy,dz,t1,t2,t3;
- __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
- __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
- __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp,dvdasum,vctot,n0d;
- __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
- __m128d c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,vgbtot,rinvsq,rinvsix;
- __m128d fac,tabscale,gbtabscale,gbfactor;
- __m128i n0,nnn;
-
- const __m128d neg = {-1.0f,-1.0f};
- const __m128d zero = {0.0f,0.0f};
- const __m128d half = {0.5f,0.5f};
- const __m128d two = {2.0f,2.0f};
- const __m128d three = {3.0f,3.0f};
- const __m128d six = {6.0f,6.0f};
- const __m128d twelwe = {12.0f,12.0f};
+ double * gpol;
+
+ __m128d iq,qq,jq,isai;
+ __m128d ix,iy,iz;
+ __m128d jx,jy,jz;
+ __m128d dx,dy,dz;
+ __m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
+ __m128d fix,fiy,fiz,tx,ty,tz,rsq;
+ __m128d rinv,isaj,isaprod;
+ __m128d vcoul,fscal,gbscale,c6,c12;
+ __m128d rinvsq,r,rtab;
+ __m128d eps,Y,F,G,H;
+ __m128d VV,FF,Fp;
+ __m128d vgb,fijGB,dvdatmp;
+ __m128d rinvsix,vvdw6,vvdw12,vvdwtmp;
+ __m128d facel,gbtabscale,dvdaj;
+ __m128d fijD,fijR,fijC;
+ __m128d xmm1,tabscale,eps2;
+ __m128i n0, nnn;
+
- const __m128i four = _mm_set_epi32(4,4,4,4);
+ const __m128d neg = _mm_set1_pd(-1.0);
+ const __m128d zero = _mm_set1_pd(0.0);
+ const __m128d minushalf = _mm_set1_pd(-0.5);
+ const __m128d two = _mm_set1_pd(2.0);
gbdata = (gmx_gbdata_t *)work;
gpol = gbdata->gpol;
-
+
nri = *p_nri;
ntype = *p_ntype;
- nthreads = *p_nthreads;
- facel = *p_facel;
- krf = *p_krf;
- crf = *p_crf;
- tabscl = *p_tabscale;
- gbtabscl = *p_gbtabscale;
- nj1 = 0;
+
+ gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
+ gbtabscale = _mm_load1_pd(p_gbtabscale);
+ facel = _mm_load1_pd(p_facel);
+ tabscale = _mm_load1_pd(p_tabscale);
+
+ nj1 = 0;
+ jnrA = jnrB = 0;
+ j3A = j3B = 0;
+ jx = _mm_setzero_pd();
+ jy = _mm_setzero_pd();
+ jz = _mm_setzero_pd();
+ c6 = _mm_setzero_pd();
+ c12 = _mm_setzero_pd();
- /* Splat variables */
- fac = _mm_load1_pd(&facel);
- tabscale = _mm_load1_pd(&tabscl);
- gbtabscale = _mm_load1_pd(&gbtabscl);
- gbfactor = _mm_set1_pd(((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
-
- /* Keep compiler happy */
- Vvdwtmp = _mm_setzero_pd();
- Vvdwtot = _mm_setzero_pd();
- dvdatmp = _mm_setzero_pd();
- dvdaj = _mm_setzero_pd();
- isaj = _mm_setzero_pd();
- vcoul = _mm_setzero_pd();
- vgb = _mm_setzero_pd();
- t1 = _mm_setzero_pd();
- t2 = _mm_setzero_pd();
- t3 = _mm_setzero_pd();
- xmm1 = _mm_setzero_pd();
- xmm2 = _mm_setzero_pd();
- xmm3 = _mm_setzero_pd();
- xmm4 = _mm_setzero_pd();
- jnr1 = jnr2 = 0;
- j13 = j23 = 0;
-
for(n=0;n<nri;n++)
{
- is3 = 3*shift[n];
- shX = shiftvec[is3];
- shY = shiftvec[is3+1];
- shZ = shiftvec[is3+2];
+ is3 = 3*shift[n];
+ shX = shiftvec[is3];
+ shY = shiftvec[is3+1];
+ shZ = shiftvec[is3+2];
+ nj0 = jindex[n];
+ nj1 = jindex[n+1];
+ ii = iinr[n];
+ ii3 = 3*ii;
- nj0 = jindex[n];
- nj1 = jindex[n+1];
- offset = (nj1-nj0)%2;
+ ix = _mm_set1_pd(shX+pos[ii3+0]);
+ iy = _mm_set1_pd(shY+pos[ii3+1]);
+ iz = _mm_set1_pd(shZ+pos[ii3+2]);
+
+ iq = _mm_load1_pd(charge+ii);
+ iq = _mm_mul_pd(iq,facel);
+
+ isai = _mm_load1_pd(invsqrta+ii);
+
+ nti = 2*ntype*type[ii];
- ii = iinr[n];
- ii3 = ii*3;
-
- ix = _mm_set1_pd(shX+pos[ii3+0]);
- iy = _mm_set1_pd(shX+pos[ii3+1]);
- iz = _mm_set1_pd(shX+pos[ii3+2]);
- q = _mm_set1_pd(charge[ii]);
-
- iq = _mm_mul_pd(fac,q);
- isai_d = invsqrta[ii];
- isai = _mm_load1_pd(&isai_d);
-
- nti = 2*ntype*type[ii];
-
- fix = _mm_setzero_pd();
- fiy = _mm_setzero_pd();
- fiz = _mm_setzero_pd();
- dvdasum = _mm_setzero_pd();
- vctot = _mm_setzero_pd();
- vgbtot = _mm_setzero_pd();
- Vvdwtot = _mm_setzero_pd();
-
- for(k=nj0;k<nj1-offset; k+=2)
+ vctot = _mm_setzero_pd();
+ vvdwtot = _mm_setzero_pd();
+ vgbtot = _mm_setzero_pd();
+ dvdasum = _mm_setzero_pd();
+ fix = _mm_setzero_pd();
+ fiy = _mm_setzero_pd();
+ fiz = _mm_setzero_pd();
+
+ for(k=nj0;k<nj1-1; k+=2)
{
- jnr1 = jjnr[k];
- jnr2 = jjnr[k+1];
-
- j13 = jnr1 * 3;
- j23 = jnr2 * 3;
-
- /* Load coordinates */
- xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */
- xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */
-
- xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */
- xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */
-
- /* transpose */
- jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
-
- /* distances */
- dx = _mm_sub_pd(ix,jx);
- dy = _mm_sub_pd(iy,jy);
- dz = _mm_sub_pd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
-
- /* Load invsqrta */
- isaj = _mm_loadl_pd(isaj,invsqrta+jnr1);
- isaj = _mm_loadh_pd(isaj,invsqrta+jnr2);
- isaprod = _mm_mul_pd(isai,isaj);
-
- /* Load charges */
- q = _mm_loadl_pd(q,charge+jnr1);
- q = _mm_loadh_pd(q,charge+jnr2);
- qq = _mm_mul_pd(iq,q);
-
- vcoul = _mm_mul_pd(qq,rinv);
- fscal = _mm_mul_pd(vcoul,rinv);
- qq = _mm_mul_pd(isaprod,qq);
- qq = _mm_mul_pd(qq,neg);
- qq = _mm_mul_pd(qq,gbfactor);
- gbscale = _mm_mul_pd(isaprod,gbtabscale);
-
- /* Load VdW parameters */
- tj = nti+2*type[jnr1];
- tj2 = nti+2*type[jnr2];
-
- xmm1 = _mm_loadu_pd(vdwparam+tj);
- xmm2 = _mm_loadu_pd(vdwparam+tj2);
- c6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- c12 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
-
- /* Load dvdaj */
- dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1);
- dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2);
-
- /* Calculate GB table index */
- r = _mm_mul_pd(rsq11,rinv);
- rt = _mm_mul_pd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_pd(rt,n0d);
- eps2 = _mm_mul_pd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_pd(G,eps);
- H = _mm_mul_pd(H,eps2);
- Fp = _mm_add_pd(F,G);
- Fp = _mm_add_pd(Fp,H);
- VV = _mm_mul_pd(Fp,eps);
- VV = _mm_add_pd(Y,VV);
- H = _mm_mul_pd(two,H);
- FF = _mm_add_pd(Fp,G);
- FF = _mm_add_pd(FF,H);
- vgb = _mm_mul_pd(qq,VV);
- fijC = _mm_mul_pd(qq,FF);
- fijC = _mm_mul_pd(fijC,gbscale);
-
- dvdatmp = _mm_mul_pd(fijC,r);
- dvdatmp = _mm_add_pd(vgb,dvdatmp);
- dvdatmp = _mm_mul_pd(dvdatmp,neg);
- dvdatmp = _mm_mul_pd(dvdatmp,half);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_pd(dvdatmp,isaj);
- xmm1 = _mm_mul_pd(xmm1,isaj);
- dvdaj = _mm_add_pd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
- _mm_storeh_pd(dvda+jnr2,dvdaj);
-
- vctot = _mm_add_pd(vctot,vcoul);
- vgbtot = _mm_add_pd(vgbtot,vgb);
-
- /* Calculate VDW table index */
- rt = _mm_mul_pd(r,tabscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_pd(rt,n0d);
+ jnrA = jjnr[k];
+ jnrB = jjnr[k+1];
+
+ j3A = jnrA * 3;
+ j3B = jnrB * 3;
+
+ GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
+
+ dx = _mm_sub_pd(ix,jx);
+ dy = _mm_sub_pd(iy,jy);
+ dz = _mm_sub_pd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_pd(rinv,rinv);
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
+ GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
+
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+ tjB = nti+2*type[jnrB];
+
+ GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
+
+ isaprod = _mm_mul_pd(isai,isaj);
+ qq = _mm_mul_pd(iq,jq);
+ vcoul = _mm_mul_pd(qq,rinv);
+ fscal = _mm_mul_pd(vcoul,rinv);
+ vctot = _mm_add_pd(vctot,vcoul);
+
+ /* Polarization interaction */
+ qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
+ gbscale = _mm_mul_pd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_pd(rsq,rinv);
+ rtab = _mm_mul_pd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
+ H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
+ F = _mm_add_pd(F, _mm_add_pd( G , H ) );
+ Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
+ F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
+ vgb = _mm_mul_pd(Y, qq);
+ fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
+
+ dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_pd(vgbtot, vgb);
+
+ dvdasum = _mm_add_pd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
+
+ GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
+
+ /* Calculate VDW table index */
+ rtab = _mm_mul_pd(r,tabscale);
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
eps2 = _mm_mul_pd(eps,eps);
nnn = _mm_slli_epi32(n0,3);
- /* Tabulated VdW interaction - dispersion */
- xmm1 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_pd(G,eps);
+ /* Dispersion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
H = _mm_mul_pd(H,eps2);
Fp = _mm_add_pd(F,G);
Fp = _mm_add_pd(Fp,H);
FF = _mm_add_pd(Fp,G);
FF = _mm_add_pd(FF,xmm1);
- Vvdw6 = _mm_mul_pd(c6,VV);
+ vvdw6 = _mm_mul_pd(c6,VV);
fijD = _mm_mul_pd(c6,FF);
-
- /* Tabulated VdW interaction - repulsion */
- nnn = _mm_add_epi32(nnn,four);
-
- xmm1 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_pd(G,eps);
+
+            /* Repulsion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
+ F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
+ H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_pd(G,eps);
H = _mm_mul_pd(H,eps2);
Fp = _mm_add_pd(F,G);
Fp = _mm_add_pd(Fp,H);
FF = _mm_add_pd(Fp,G);
FF = _mm_add_pd(FF,xmm1);
- Vvdw12 = _mm_mul_pd(c12,VV);
+ vvdw12 = _mm_mul_pd(c12,VV);
fijR = _mm_mul_pd(c12,FF);
- Vvdwtmp = _mm_add_pd(Vvdw12,Vvdw6);
- Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
-
+ vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
+ vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
+
xmm1 = _mm_add_pd(fijD,fijR);
xmm1 = _mm_mul_pd(xmm1,tabscale);
xmm1 = _mm_add_pd(xmm1,fijC);
xmm1 = _mm_sub_pd(xmm1,fscal);
fscal = _mm_mul_pd(xmm1,neg);
fscal = _mm_mul_pd(fscal,rinv);
-
- /* calculate partial force terms */
- t1 = _mm_mul_pd(fscal,dx);
- t2 = _mm_mul_pd(fscal,dy);
- t3 = _mm_mul_pd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
- xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */
-
- xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
- xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
-
- /* transpose */
- xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
- xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
- xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_pd(xmm5,t1);
- xmm6 = _mm_sub_pd(xmm6,t2);
- xmm7 = _mm_sub_pd(xmm7,t3);
-
- xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
- xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
-
- /* store fx and fy */
- _mm_storeu_pd(faction+j13,xmm1);
- _mm_storeu_pd(faction+j23,xmm2);
-
- /* .. then fz */
- _mm_storel_pd(faction+j13+2,xmm7);
- _mm_storel_pd(faction+j23+2,xmm7);
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_pd(fscal,dx);
+ ty = _mm_mul_pd(fscal,dy);
+ tz = _mm_mul_pd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_pd(fix,tx);
+ fiy = _mm_add_pd(fiy,ty);
+ fiz = _mm_add_pd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
}
/* In double precision, offset can only be either 0 or 1 */
- if(offset!=0)
+ if(k<nj1)
{
- jnr1 = jjnr[k];
- j13 = jnr1*3;
-
- jx = _mm_load_sd(pos+j13);
- jy = _mm_load_sd(pos+j13+1);
- jz = _mm_load_sd(pos+j13+2);
-
- isaj = _mm_load_sd(invsqrta+jnr1);
- isaprod = _mm_mul_sd(isai,isaj);
- dvdaj = _mm_load_sd(dvda+jnr1);
- q = _mm_load_sd(charge+jnr1);
- qq = _mm_mul_sd(iq,q);
-
- dx = _mm_sub_sd(ix,jx);
- dy = _mm_sub_sd(iy,jy);
- dz = _mm_sub_sd(iz,jz);
-
- rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
- rinv = gmx_mm_invsqrt_pd(rsq11);
-
- vcoul = _mm_mul_sd(qq,rinv);
- fscal = _mm_mul_sd(vcoul,rinv);
- qq = _mm_mul_sd(isaprod,qq);
- qq = _mm_mul_sd(qq,neg);
- qq = _mm_mul_sd(qq,gbfactor);
- gbscale = _mm_mul_sd(isaprod,gbtabscale);
-
- /* Load VdW parameters */
- tj = nti+2*type[jnr1];
-
- c6 = _mm_load_sd(vdwparam+tj);
- c12 = _mm_load_sd(vdwparam+tj+1);
-
- /* Calculate GB table index */
- r = _mm_mul_sd(rsq11,rinv);
- rt = _mm_mul_sd(r,gbscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_sd(rt,n0d);
- eps2 = _mm_mul_sd(eps,eps);
-
- nnn = _mm_slli_epi64(n0,2);
-
- xmm1 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0)));
- xmm2 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1)));
- xmm3 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,0))+2);
- xmm4 = _mm_load_pd(GBtab+(gmx_mm_extract_epi64(nnn,1))+2);
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0));
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1));
-
- G = _mm_mul_sd(G,eps);
- H = _mm_mul_sd(H,eps2);
- Fp = _mm_add_sd(F,G);
- Fp = _mm_add_sd(Fp,H);
- VV = _mm_mul_sd(Fp,eps);
- VV = _mm_add_sd(Y,VV);
- H = _mm_mul_sd(two,H);
- FF = _mm_add_sd(Fp,G);
- FF = _mm_add_sd(FF,H);
- vgb = _mm_mul_sd(qq,VV);
- fijC = _mm_mul_sd(qq,FF);
- fijC = _mm_mul_sd(fijC,gbscale);
-
- dvdatmp = _mm_mul_sd(fijC,r);
- dvdatmp = _mm_add_sd(vgb,dvdatmp);
- dvdatmp = _mm_mul_sd(dvdatmp,neg);
- dvdatmp = _mm_mul_sd(dvdatmp,half);
- dvdasum = _mm_add_sd(dvdasum,dvdatmp);
-
- xmm1 = _mm_mul_sd(dvdatmp,isaj);
- xmm1 = _mm_mul_sd(xmm1,isaj);
- dvdaj = _mm_add_sd(dvdaj,xmm1);
-
- /* store dvda */
- _mm_storel_pd(dvda+jnr1,dvdaj);
-
- vctot = _mm_add_sd(vctot,vcoul);
- vgbtot = _mm_add_sd(vgbtot,vgb);
-
- /* Calculate VDW table index */
- rt = _mm_mul_sd(r,tabscale);
- n0 = _mm_cvttpd_epi32(rt);
- n0d = _mm_cvtepi32_pd(n0);
- eps = _mm_sub_sd(rt,n0d);
+ jnrA = jjnr[k];
+ j3A = jnrA * 3;
+
+ GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
+
+ dx = _mm_sub_sd(ix,jx);
+ dy = _mm_sub_sd(iy,jy);
+ dz = _mm_sub_sd(iz,jz);
+
+ rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
+
+ rinv = gmx_mm_invsqrt_pd(rsq);
+ rinvsq = _mm_mul_sd(rinv,rinv);
+
+ /***********************************/
+ /* INTERACTION SECTION STARTS HERE */
+ /***********************************/
+ GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
+ GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
+
+ /* Lennard-Jones */
+ tjA = nti+2*type[jnrA];
+
+ GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
+
+ isaprod = _mm_mul_sd(isai,isaj);
+ qq = _mm_mul_sd(iq,jq);
+ vcoul = _mm_mul_sd(qq,rinv);
+ fscal = _mm_mul_sd(vcoul,rinv);
+ vctot = _mm_add_sd(vctot,vcoul);
+
+ /* Polarization interaction */
+ qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
+ gbscale = _mm_mul_sd(isaprod,gbtabscale);
+
+ /* Calculate GB table index */
+ r = _mm_mul_sd(rsq,rinv);
+ rtab = _mm_mul_sd(r,gbscale);
+
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
+ nnn = _mm_slli_epi32(n0,2);
+
+ /* the tables are 16-byte aligned, so we can use _mm_load_pd */
+ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
+ H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
+ F = _mm_add_sd(F, _mm_add_sd( G , H ) );
+ Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
+ F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
+ vgb = _mm_mul_sd(Y, qq);
+ fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
+
+ dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
+
+ vgbtot = _mm_add_sd(vgbtot, vgb);
+
+ dvdasum = _mm_add_sd(dvdasum, dvdatmp);
+ dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
+
+ GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
+
+ /* Calculate VDW table index */
+ rtab = _mm_mul_sd(r,tabscale);
+ n0 = _mm_cvttpd_epi32(rtab);
+ eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
eps2 = _mm_mul_sd(eps,eps);
nnn = _mm_slli_epi32(n0,3);
- /* Tabulated VdW interaction - dispersion */
- xmm1 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_sd(G,eps);
+ /* Dispersion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
H = _mm_mul_sd(H,eps2);
Fp = _mm_add_sd(F,G);
Fp = _mm_add_sd(Fp,H);
FF = _mm_add_sd(Fp,G);
FF = _mm_add_sd(FF,xmm1);
- Vvdw6 = _mm_mul_sd(c6,VV);
+ vvdw6 = _mm_mul_sd(c6,VV);
fijD = _mm_mul_sd(c6,FF);
-
- /* Tabulated VdW interaction - repulsion */
- nnn = _mm_add_epi32(nnn,four);
-
- xmm1 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))); /* Y1 F1 */
- xmm2 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))); /* Y2 F2 */
- xmm3 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
- xmm4 = _mm_load_pd(VFtab+(gmx_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
-
- Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
- F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
- G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
- H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
-
- G = _mm_mul_sd(G,eps);
+
+            /* Repulsion */
+ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
+ F = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(Y,F);
+ G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
+ H = _mm_setzero_pd();
+ GMX_MM_TRANSPOSE2_PD(G,H);
+
+ G = _mm_mul_sd(G,eps);
H = _mm_mul_sd(H,eps2);
Fp = _mm_add_sd(F,G);
Fp = _mm_add_sd(Fp,H);
FF = _mm_add_sd(Fp,G);
FF = _mm_add_sd(FF,xmm1);
- Vvdw12 = _mm_mul_sd(c12,VV);
+ vvdw12 = _mm_mul_sd(c12,VV);
fijR = _mm_mul_sd(c12,FF);
- Vvdwtmp = _mm_add_sd(Vvdw12,Vvdw6);
- Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);
-
+ vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
+ vvdwtot = _mm_add_sd(vvdwtot,vvdwtmp);
+
xmm1 = _mm_add_sd(fijD,fijR);
xmm1 = _mm_mul_sd(xmm1,tabscale);
xmm1 = _mm_add_sd(xmm1,fijC);
xmm1 = _mm_sub_sd(xmm1,fscal);
fscal = _mm_mul_sd(xmm1,neg);
fscal = _mm_mul_sd(fscal,rinv);
-
- /* calculate partial force terms */
- t1 = _mm_mul_sd(fscal,dx);
- t2 = _mm_mul_sd(fscal,dy);
- t3 = _mm_mul_sd(fscal,dz);
-
- /* update the i force */
- fix = _mm_add_sd(fix,t1);
- fiy = _mm_add_sd(fiy,t2);
- fiz = _mm_add_sd(fiz,t3);
-
- /* accumulate forces from memory */
- xmm5 = _mm_load_sd(faction+j13); /* fx */
- xmm6 = _mm_load_sd(faction+j13+1); /* fy */
- xmm7 = _mm_load_sd(faction+j13+2); /* fz */
-
- /* subtract partial forces */
- xmm5 = _mm_sub_sd(xmm5,t1);
- xmm6 = _mm_sub_sd(xmm6,t2);
- xmm7 = _mm_sub_sd(xmm7,t3);
-
- /* store forces */
- _mm_store_sd(faction+j13,xmm5);
- _mm_store_sd(faction+j13+1,xmm6);
- _mm_store_sd(faction+j13+2,xmm7);
+
+ /***********************************/
+ /* INTERACTION SECTION ENDS HERE */
+ /***********************************/
+
+ /* Calculate temporary vectorial force */
+ tx = _mm_mul_sd(fscal,dx);
+ ty = _mm_mul_sd(fscal,dy);
+ tz = _mm_mul_sd(fscal,dz);
+
+ /* Increment i atom force */
+ fix = _mm_add_sd(fix,tx);
+ fiy = _mm_add_sd(fiy,ty);
+ fiz = _mm_add_sd(fiz,tz);
+
+ /* Store j forces back */
+ GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
}
- /* fix/fiy/fiz now contain four partial terms, that all should be
- * added to the i particle forces
- */
- t1 = _mm_unpacklo_pd(t1,fix);
- t2 = _mm_unpacklo_pd(t2,fiy);
- t3 = _mm_unpacklo_pd(t3,fiz);
-
- fix = _mm_add_pd(fix,t1);
- fiy = _mm_add_pd(fiy,t2);
- fiz = _mm_add_pd(fiz,t3);
-
- fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
- fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
- fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
-
- /* Load i forces from memory */
- xmm1 = _mm_load_sd(faction+ii3);
- xmm2 = _mm_load_sd(faction+ii3+1);
- xmm3 = _mm_load_sd(faction+ii3+2);
-
- /* Add to i force */
- fix = _mm_add_sd(fix,xmm1);
- fiy = _mm_add_sd(fiy,xmm2);
- fiz = _mm_add_sd(fiz,xmm3);
-
- /* store i forces to memory */
- _mm_store_sd(faction+ii3,fix);
- _mm_store_sd(faction+ii3+1,fiy);
- _mm_store_sd(faction+ii3+2,fiz);
-
- /* Load i shift forces from memory */
- xmm1 = _mm_load_sd(fshift+is3);
- xmm2 = _mm_load_sd(fshift+is3+1);
- xmm3 = _mm_load_sd(fshift+is3+2);
-
- /* Add to i force */
- fix = _mm_add_sd(fix,xmm1);
- fiy = _mm_add_sd(fiy,xmm2);
- fiz = _mm_add_sd(fiz,xmm3);
-
- /* store i forces to memory */
- _mm_store_sd(fshift+is3,fix);
- _mm_store_sd(fshift+is3+1,fiy);
- _mm_store_sd(fshift+is3+2,fiz);
-
- /* now do dvda */
- dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum);
- dvdasum = _mm_add_pd(dvdasum,dvdatmp);
- _mm_storeh_pd(&dva,dvdasum);
- dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
-
- ggid = gid[n];
-
- /* Coulomb potential */
- vcoul = _mm_unpacklo_pd(vcoul,vctot);
- vctot = _mm_add_pd(vctot,vcoul);
- _mm_storeh_pd(&vct,vctot);
- Vc[ggid] = Vc[ggid] + vct;
-
- /* VdW potential */
- Vvdwtmp = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
- Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
- _mm_storeh_pd(&vdwt,Vvdwtot);
- Vvdw[ggid] = Vvdw[ggid] + vdwt;
-
- /* GB potential */
- vgb = _mm_unpacklo_pd(vgb,vgbtot);
- vgbtot = _mm_add_pd(vgbtot,vgb);
- _mm_storeh_pd(&vgbt,vgbtot);
- gpol[ggid] = gpol[ggid] + vgbt;
+ dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
+ gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
+
+ ggid = gid[n];
+
+ gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
+ gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
}
-
+
*outeriter = nri;
*inneriter = nj1;
-
-
-
-
-
}
+