2 * Copyright (c) Erik Lindahl, David van der Spoel 2003
4 * This file is generated automatically at compile time
5 * by the program mknb in the Gromacs distribution.
7 * Options used when generation this file:
11 * Software invsqrt: yes
21 #include <emmintrin.h>
23 #include <gmx_sse2_double.h>
25 /* get gmx_gbdata_t */
26 #include "../nb_kerneltype.h"
29 void nb_kernel400_sse2_double(int * p_nri,
52 double * p_gbtabscale,
62 int n,ii,is3,ii3,k,nj0,nj1,ggid;
69 __m128d iq,qq,jq,isai;
73 __m128d vctot,vgbtot,dvdasum,gbfactor;
74 __m128d fix,fiy,fiz,tx,ty,tz,rsq;
75 __m128d rinv,isaj,isaprod;
76 __m128d vcoul,fscal,gbscale;
77 __m128d rinvsq,r,rtab;
79 __m128d vgb,fijGB,dvdatmp;
80 __m128d facel,gbtabscale,dvdaj;
83 const __m128d neg = _mm_set1_pd(-1.0);
84 const __m128d zero = _mm_set1_pd(0.0);
85 const __m128d minushalf = _mm_set1_pd(-0.5);
86 const __m128d two = _mm_set1_pd(2.0);
88 gbdata = (gmx_gbdata_t *)work;
93 gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
94 gbtabscale = _mm_load1_pd(p_gbtabscale);
95 facel = _mm_load1_pd(p_facel);
100 jx = _mm_setzero_pd();
101 jy = _mm_setzero_pd();
102 jz = _mm_setzero_pd();
108 shY = shiftvec[is3+1];
109 shZ = shiftvec[is3+2];
115 ix = _mm_set1_pd(shX+pos[ii3+0]);
116 iy = _mm_set1_pd(shY+pos[ii3+1]);
117 iz = _mm_set1_pd(shZ+pos[ii3+2]);
119 iq = _mm_load1_pd(charge+ii);
120 iq = _mm_mul_pd(iq,facel);
122 isai = _mm_load1_pd(invsqrta+ii);
124 vctot = _mm_setzero_pd();
125 vgbtot = _mm_setzero_pd();
126 dvdasum = _mm_setzero_pd();
127 fix = _mm_setzero_pd();
128 fiy = _mm_setzero_pd();
129 fiz = _mm_setzero_pd();
131 for(k=nj0;k<nj1-1; k+=2)
139 GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
141 dx = _mm_sub_pd(ix,jx);
142 dy = _mm_sub_pd(iy,jy);
143 dz = _mm_sub_pd(iz,jz);
145 rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
147 rinv = gmx_mm_invsqrt_pd(rsq);
148 rinvsq = _mm_mul_pd(rinv,rinv);
150 /***********************************/
151 /* INTERACTION SECTION STARTS HERE */
152 /***********************************/
153 GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
154 GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
156 isaprod = _mm_mul_pd(isai,isaj);
157 qq = _mm_mul_pd(iq,jq);
158 vcoul = _mm_mul_pd(qq,rinv);
159 fscal = _mm_mul_pd(vcoul,rinv);
160 vctot = _mm_add_pd(vctot,vcoul);
162 /* Polarization interaction */
163 qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
164 gbscale = _mm_mul_pd(isaprod,gbtabscale);
166 /* Calculate GB table index */
167 r = _mm_mul_pd(rsq,rinv);
168 rtab = _mm_mul_pd(r,gbscale);
170 n0 = _mm_cvttpd_epi32(rtab);
171 eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
172 nnn = _mm_slli_epi32(n0,2);
174 /* the tables are 16-byte aligned, so we can use _mm_load_pd */
175 Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
176 F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
177 GMX_MM_TRANSPOSE2_PD(Y,F);
178 G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
179 H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
180 GMX_MM_TRANSPOSE2_PD(G,H);
182 G = _mm_mul_pd(G,eps);
183 H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
184 F = _mm_add_pd(F, _mm_add_pd( G , H ) );
185 Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
186 F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
187 vgb = _mm_mul_pd(Y, qq);
188 fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
190 dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
192 vgbtot = _mm_add_pd(vgbtot, vgb);
194 dvdasum = _mm_add_pd(dvdasum, dvdatmp);
195 dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
197 GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
199 fscal = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
201 /***********************************/
202 /* INTERACTION SECTION ENDS HERE */
203 /***********************************/
205 /* Calculate temporary vectorial force */
206 tx = _mm_mul_pd(fscal,dx);
207 ty = _mm_mul_pd(fscal,dy);
208 tz = _mm_mul_pd(fscal,dz);
210 /* Increment i atom force */
211 fix = _mm_add_pd(fix,tx);
212 fiy = _mm_add_pd(fiy,ty);
213 fiz = _mm_add_pd(fiz,tz);
215 /* Store j forces back */
216 GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
219 /* In double precision, offset can only be either 0 or 1 */
225 GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
227 dx = _mm_sub_sd(ix,jx);
228 dy = _mm_sub_sd(iy,jy);
229 dz = _mm_sub_sd(iz,jz);
231 rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
233 rinv = gmx_mm_invsqrt_pd(rsq);
234 rinvsq = _mm_mul_sd(rinv,rinv);
236 /* These reason for zeroing these variables here is for fixing bug 585
237 * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
238 * and r1=0, but it should be r1=a[1].
239 * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
240 * To work around it, we zero these variables and use _mm_add_pd (**) instead
241 * Note that the only variables that get affected are the energies since
242 * the total sum needs to be correct
244 vgb = _mm_setzero_pd();
245 vcoul = _mm_setzero_pd();
246 dvdatmp = _mm_setzero_pd();
248 /***********************************/
249 /* INTERACTION SECTION STARTS HERE */
250 /***********************************/
251 GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
252 GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
254 isaprod = _mm_mul_sd(isai,isaj);
255 qq = _mm_mul_sd(jq,iq);
256 vcoul = _mm_mul_sd(qq,rinv);
257 fscal = _mm_mul_sd(vcoul,rinv);
258 vctot = _mm_add_pd(vctot,vcoul); /* (**) */
260 /* Polarization interaction */
261 qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
262 gbscale = _mm_mul_sd(isaprod,gbtabscale);
264 /* Calculate GB table index */
265 r = _mm_mul_sd(rsq,rinv);
266 rtab = _mm_mul_sd(r,gbscale);
268 n0 = _mm_cvttpd_epi32(rtab);
269 eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
270 nnn = _mm_slli_epi32(n0,2);
272 /* the tables are 16-byte aligned, so we can use _mm_load_pd */
273 Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
274 F = _mm_setzero_pd();
275 GMX_MM_TRANSPOSE2_PD(Y,F);
276 G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
277 H = _mm_setzero_pd();
278 GMX_MM_TRANSPOSE2_PD(G,H);
280 G = _mm_mul_sd(G,eps);
281 H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
282 F = _mm_add_sd(F, _mm_add_sd( G , H ) );
283 Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
284 F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
285 vgb = _mm_mul_sd(Y, qq);
286 fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
288 dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
290 vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */
292 dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
293 dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
295 GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
297 fscal = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
299 /***********************************/
300 /* INTERACTION SECTION ENDS HERE */
301 /***********************************/
303 /* Calculate temporary vectorial force */
304 tx = _mm_mul_sd(fscal,dx);
305 ty = _mm_mul_sd(fscal,dy);
306 tz = _mm_mul_sd(fscal,dz);
308 /* Increment i atom force */
309 fix = _mm_add_sd(fix,tx);
310 fiy = _mm_add_sd(fiy,ty);
311 fiz = _mm_add_sd(fiz,tz);
313 /* Store j forces back */
314 GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
317 dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
318 gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
322 gmx_mm_update_1pot_pd(vctot,vc+ggid);
323 gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
324 gmx_mm_update_1pot_pd(dvdasum,dvda+ii);