/*
 * Copyright (c) Erik Lindahl, David van der Spoel 2003
 *
 * This file is generated automatically at compile time
 * by the program mknb in the Gromacs distribution.
 *
 * Options used when generating this file:
 * Software invsqrt: yes
 */
#include <emmintrin.h>

#include <gmx_sse2_single.h>

/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"
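
/*
 * Illustrative sketch only (a hypothetical helper, not part of the generated
 * kernel): the scalar form of the cubic-spline table lookup that the SSE
 * code below performs four lanes at a time. The table holds (Y,F,G,H)
 * quadruplets per point, matching the GBtab/VFtab layout used here.
 */
static float nb430_ref_table_interp(const float *tab, int n0, float eps,
                                    float *force)
{
    float Y     = tab[4*n0];
    float F     = tab[4*n0+1];
    float Geps  = tab[4*n0+2]*eps;
    float Heps2 = tab[4*n0+3]*eps*eps;
    float Fp    = F + Geps + Heps2;

    *force = Fp + Geps + 2.0f*Heps2; /* FF = F + 2*G*eps + 3*H*eps^2 */
    return Y + eps*Fp;               /* VV: interpolated potential   */
}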
void nb_kernel430_sse2_single(int * p_nri,
    int nri,ntype,nthreads;
    float facel,krf,crf,tabscale,gbtabscale;
    int n,ii,is3,ii3,k,nj0,nj1,ggid;
    int jnr,jnr2,jnr3,jnr4,j3,j23,j33,j43;
    __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
    __m128 vctot,Vvdwtot,dvdasum;
    __m128 fix,fiy,fiz,rsq;
    __m128 rinv,isaj,isaprod;
    __m128 vcoul,fscal,gbscale,c6,c12;
    __m128 rinvsq,dvdaj,r,rt;
    __m128 eps,eps2,Y,F,G,H,Geps,Heps2;
    __m128 Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp;
    __m128 rinvsix,Vvdw6,Vvdw12,Vvdwtmp,vgbtot,n0f;
    __m128 fac_sse,tabscale_sse,gbtabscale_sse;

    const __m128 neg    = _mm_set1_ps(-1.0f);
    const __m128 zero   = _mm_set1_ps(0.0f);
    const __m128 half   = _mm_set1_ps(0.5f);
    const __m128 two    = _mm_set1_ps(2.0f);
    const __m128 three  = _mm_set1_ps(3.0f);
    const __m128 six    = _mm_set1_ps(6.0f);
    const __m128 twelve = _mm_set1_ps(12.0f);
    __m128i four  = _mm_set1_epi32(4);
    __m128i maski = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);
    __m128i mask  = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);

    float vct,vdwt,vgbt,dva,isai_f;
    gbdata = (gmx_gbdata_t *)work;

    nthreads = *p_nthreads;
    facel    = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));

    tabscale   = *p_tabscale;
    gbtabscale = *p_gbtabscale;

    /* Splat variables */
    fac_sse        = _mm_load1_ps(&facel);
    tabscale_sse   = _mm_load1_ps(&tabscale);
    gbtabscale_sse = _mm_load1_ps(&gbtabscale);
    /* Keep the compiler happy */
    Vvdwtmp = _mm_setzero_ps();
    dvdatmp = _mm_setzero_ps();
    vcoul   = _mm_setzero_ps();
    vgb     = _mm_setzero_ps();
    t1      = _mm_setzero_ps();
    t2      = _mm_setzero_ps();
    t3      = _mm_setzero_ps();
    xmm1    = _mm_setzero_ps();
    xmm2    = _mm_setzero_ps();
    xmm3    = _mm_setzero_ps();
    xmm4    = _mm_setzero_ps();
    for(n=0; (n<nri); n++)
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        ix = _mm_set1_ps(shX+pos[ii3+0]);
        iy = _mm_set1_ps(shY+pos[ii3+1]);
        iz = _mm_set1_ps(shZ+pos[ii3+2]);

        iq = _mm_load1_ps(charge+ii);
        iq = _mm_mul_ps(iq,fac_sse);

        isai_f = invsqrta[ii];
        isai   = _mm_load1_ps(&isai_f);

        nti = 2*ntype*type[ii];

        vctot   = _mm_setzero_ps();
        Vvdwtot = _mm_setzero_ps();
        dvdasum = _mm_setzero_ps();
        vgbtot  = _mm_setzero_ps();
        fix     = _mm_setzero_ps();
        fiy     = _mm_setzero_ps();
        fiz     = _mm_setzero_ps();

        offset = (nj1-nj0)%4;
        for(k=nj0;k<nj1-offset;k+=4)
            xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (pos+j3));  /* x1 y1 - - */
            xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (pos+j23)); /* x2 y2 - - */
            xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (pos+j33)); /* x3 y3 - - */
            xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (pos+j43)); /* x4 y4 - - */

            xmm5 = _mm_load1_ps(pos+j3+2);
            xmm6 = _mm_load1_ps(pos+j23+2);
            xmm7 = _mm_load1_ps(pos+j33+2);
            xmm8 = _mm_load1_ps(pos+j43+2);

            xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
            xmm6 = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
            jz   = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));

            xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
            xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
            jx   = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
            jy   = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
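
            /*
             * The shuffles above implement a 4x4 transpose: the four
             * per-particle (x,y,z) triplets are rearranged into the
             * lane-parallel vectors jx, jy, jz so one SSE operation
             * handles all four j particles at once.
             */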
            dx1 = _mm_sub_ps(ix,jx);
            dy1 = _mm_sub_ps(iy,jy);
            dz1 = _mm_sub_ps(iz,jz);

            t1 = _mm_mul_ps(dx1,dx1);
            t2 = _mm_mul_ps(dy1,dy1);
            t3 = _mm_mul_ps(dz1,dz1);

            rsq  = _mm_add_ps(t1,t2);
            rsq  = _mm_add_ps(rsq,t3);
            rinv = gmx_mm_invsqrt_ps(rsq);

            xmm1 = _mm_load_ss(invsqrta+jnr);
            xmm2 = _mm_load_ss(invsqrta+jnr2);
            xmm3 = _mm_load_ss(invsqrta+jnr3);
            xmm4 = _mm_load_ss(invsqrta+jnr4);

            xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
            xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
            isaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));

            isaprod = _mm_mul_ps(isai,isaj);

            xmm1 = _mm_load_ss(charge+jnr);
            xmm2 = _mm_load_ss(charge+jnr2);
            xmm3 = _mm_load_ss(charge+jnr3);
            xmm4 = _mm_load_ss(charge+jnr4);

            xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
            xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
            q    = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
            qq   = _mm_mul_ps(iq,q);

            vcoul   = _mm_mul_ps(qq,rinv);
            fscal   = _mm_mul_ps(vcoul,rinv);
            qq      = _mm_mul_ps(qq,neg);
            qq      = _mm_mul_ps(isaprod,qq);
            gbscale = _mm_mul_ps(isaprod,gbtabscale_sse);
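
            /*
             * GB prefactors: the charge product is negated and scaled by
             * isaprod = isai*isaj (the i and j invsqrta screening factors),
             * and the GB table spacing is stretched by the same factor, so
             * the table below is indexed by r*isaprod*gbtabscale.
             */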
            tj  = nti+2*type[jnr];
            tj2 = nti+2*type[jnr2];
            tj3 = nti+2*type[jnr3];
            tj4 = nti+2*type[jnr4];

            xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (vdwparam+tj));
            xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (vdwparam+tj2));
            xmm2 = _mm_loadl_pi(xmm2,(__m64 *) (vdwparam+tj3));
            xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (vdwparam+tj4));

            xmm3 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,2,0,2)); /* c61 c62 c61 c62 */
            xmm4 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(0,2,0,2)); /* c63 c64 c63 c64 */
            c6   = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,1,0,1)); /* c61 c62 c63 c64 */

            xmm3 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1)); /* c121 c122 c121 c122 */
            xmm4 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,3,1)); /* c123 c124 c123 c124 */
            c12  = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(1,0,1,0)); /* c121 c122 c123 c124 */

            xmm1 = _mm_load_ss(dvda+jnr);
            xmm2 = _mm_load_ss(dvda+jnr2);
            xmm3 = _mm_load_ss(dvda+jnr3);
            xmm4 = _mm_load_ss(dvda+jnr4);

            xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
            xmm3  = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
            dvdaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));

            /* Calculate GB table index */
            r  = _mm_mul_ps(rsq,rinv);
            rt = _mm_mul_ps(r,gbscale);

            n0  = _mm_cvttps_epi32(rt);
            n0f = _mm_cvtepi32_ps(n0);
            eps = _mm_sub_ps(rt,n0f);

            eps2 = _mm_mul_ps(eps,eps);
            nnn  = _mm_slli_epi32(n0,2);

            /* the tables are 16-byte aligned, so we can use _mm_load_ps */
            xmm1 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
            xmm2 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
            xmm3 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
            xmm4 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */

            xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
            xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
            xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
            xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

            Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
            F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
            G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
            H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */
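
            /*
             * Cubic-spline interpolation from the four table entries:
             *   VV = Y + eps*(F + G*eps + H*eps^2)   (potential)
             *   FF = F + 2*G*eps + 3*H*eps^2         (derivative)
             * evaluated below via Fp = F + Geps + Heps2.
             */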
            Geps  = _mm_mul_ps(G,eps);
            Heps2 = _mm_mul_ps(H,eps2);
            Fp    = _mm_add_ps(F,Geps);
            Fp    = _mm_add_ps(Fp,Heps2);

            VV = _mm_mul_ps(Fp,eps);
            VV = _mm_add_ps(Y,VV);

            xmm1 = _mm_mul_ps(two,Heps2);
            FF   = _mm_add_ps(Fp,Geps);
            FF   = _mm_add_ps(FF,xmm1);

            vgb  = _mm_mul_ps(qq,VV);
            fijC = _mm_mul_ps(qq,FF);
            fijC = _mm_mul_ps(fijC,gbscale);
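
            /*
             * Born-radius derivative contribution for this interaction:
             * dvdatmp = -(vgb + fijC*r)/2, accumulated into dvdasum for
             * the i particle and, weighted by isaj*isaj, scattered back
             * to the four j particles below.
             */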
            dvdatmp = _mm_mul_ps(fijC,r);
            dvdatmp = _mm_add_ps(vgb,dvdatmp);
            dvdatmp = _mm_mul_ps(half,dvdatmp);
            dvdatmp = _mm_mul_ps(neg,dvdatmp);

            dvdasum = _mm_add_ps(dvdasum,dvdatmp);

            xmm1  = _mm_mul_ps(dvdatmp,isaj);
            xmm1  = _mm_mul_ps(xmm1,isaj);
            dvdaj = _mm_add_ps(dvdaj,xmm1);

            _mm_store_ss(dvda+jnr,dvdaj);
            xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1));
            _mm_store_ss(dvda+jnr2,xmm1);
            xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(1,0,3,2));
            _mm_store_ss(dvda+jnr3,xmm1);
            xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(2,1,0,3));
            _mm_store_ss(dvda+jnr4,xmm1);

            vctot  = _mm_add_ps(vctot,vcoul);
            vgbtot = _mm_add_ps(vgbtot,vgb);

            /* Calculate VDW table index */
            rt   = _mm_mul_ps(r,tabscale_sse);
            n0   = _mm_cvttps_epi32(rt);
            n0f  = _mm_cvtepi32_ps(n0);
            eps  = _mm_sub_ps(rt,n0f);
            eps2 = _mm_mul_ps(eps,eps);
            nnn  = _mm_slli_epi32(n0,3);
            /* Tabulated VdW interaction - dispersion */
            xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
            xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
            xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
            xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */

            xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
            xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
            xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
            xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

            Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
            F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
            G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
            H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */

            Geps  = _mm_mul_ps(G,eps);
            Heps2 = _mm_mul_ps(H,eps2);
            Fp    = _mm_add_ps(F,Geps);
            Fp    = _mm_add_ps(Fp,Heps2);
            VV    = _mm_mul_ps(Fp,eps);
            VV    = _mm_add_ps(Y,VV);
            xmm1  = _mm_mul_ps(two,Heps2);
            FF    = _mm_add_ps(Fp,Geps);
            FF    = _mm_add_ps(FF,xmm1);

            Vvdw6 = _mm_mul_ps(c6,VV);
            fijD  = _mm_mul_ps(c6,FF);

            /* Tabulated VdW interaction - repulsion */
            nnn = _mm_add_epi32(nnn,four);

            xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
            xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
            xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
            xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */

            xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
            xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
            xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
            xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

            Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
            F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
            G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
            H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */

            Geps  = _mm_mul_ps(G,eps);
            Heps2 = _mm_mul_ps(H,eps2);
            Fp    = _mm_add_ps(F,Geps);
            Fp    = _mm_add_ps(Fp,Heps2);
            VV    = _mm_mul_ps(Fp,eps);
            VV    = _mm_add_ps(Y,VV);
            xmm1  = _mm_mul_ps(two,Heps2);
            FF    = _mm_add_ps(Fp,Geps);
            FF    = _mm_add_ps(FF,xmm1);

            Vvdw12 = _mm_mul_ps(c12,VV);
            fijR   = _mm_mul_ps(c12,FF);

            Vvdwtmp = _mm_add_ps(Vvdw12,Vvdw6);
            Vvdwtot = _mm_add_ps(Vvdwtot,Vvdwtmp);
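
            /*
             * Combine the scalar forces: Coulomb (fscal = vcoul*rinv so far)
             * minus the GB table derivative (fijC, already scaled by gbscale)
             * and the VdW table derivatives scaled by tabscale, the whole
             * multiplied by rinv so it can be applied along dx1/dy1/dz1:
             *   fscal = (vcoul*rinv - fijC - (fijD+fijR)*tabscale) * rinv
             */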
            xmm1  = _mm_add_ps(fijD,fijR);
            xmm1  = _mm_mul_ps(xmm1,tabscale_sse);
            xmm1  = _mm_add_ps(xmm1,fijC);
            xmm1  = _mm_sub_ps(xmm1,fscal);
            fscal = _mm_mul_ps(xmm1,neg);
            fscal = _mm_mul_ps(fscal,rinv);

            t1 = _mm_mul_ps(fscal,dx1);
            t2 = _mm_mul_ps(fscal,dy1);
            t3 = _mm_mul_ps(fscal,dz1);

            fix = _mm_add_ps(fix,t1);
            fiy = _mm_add_ps(fiy,t2);
            fiz = _mm_add_ps(fiz,t3);

            xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3));  /* fx1 fy1 - - */
            xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23)); /* fx2 fy2 - - */
            xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (faction+j33)); /* fx3 fy3 - - */
            xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (faction+j43)); /* fx4 fy4 - - */

            xmm5 = _mm_load1_ps(faction+j3+2);
            xmm6 = _mm_load1_ps(faction+j23+2);
            xmm7 = _mm_load1_ps(faction+j33+2);
            xmm8 = _mm_load1_ps(faction+j43+2);

            /* transpose the forces */
            xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
            xmm6 = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
            xmm7 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));

            xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
            xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));

            xmm5 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
            xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));

            /* subtract partial terms */
            xmm5 = _mm_sub_ps(xmm5,t1);
            xmm6 = _mm_sub_ps(xmm6,t2);
            xmm7 = _mm_sub_ps(xmm7,t3);

            xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
            xmm2 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2));
            xmm1 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0));
            xmm2 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0));

            /* store fx's and fy's */
            _mm_storel_pi( (__m64 *) (faction+j3), xmm1);
            _mm_storeh_pi( (__m64 *) (faction+j23),xmm1);
            _mm_storel_pi( (__m64 *) (faction+j33),xmm2);
            _mm_storeh_pi( (__m64 *) (faction+j43),xmm2);

            _mm_store_ss(faction+j3+2,xmm7);
            xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
            _mm_store_ss(faction+j23+2,xmm7);
            xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
            _mm_store_ss(faction+j33+2,xmm7);
            xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
            _mm_store_ss(faction+j43+2,xmm7);
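
        /*
         * Epilogue: the 1-3 leftover j particles (offset = (nj1-nj0)%4) are
         * handled with the same vector arithmetic; only the valid elements
         * are loaded, and a lane mask zeroes the unused slots so they add
         * nothing to the accumulators.
         */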
        tj = nti+2*type[jnr];

        xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (pos+j3));
        xmm5 = _mm_load1_ps(pos+j3+2);

        xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
        xmm4 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));

        isaj  = _mm_load_ss(invsqrta+jnr);
        dvdaj = _mm_load_ss(dvda+jnr);
        q     = _mm_load_ss(charge+jnr);
        c6    = _mm_load_ss(vdwparam+tj);
        c12   = _mm_load_ss(vdwparam+tj+1);

        mask = _mm_set_epi32(0,0,0,0xffffffff);
        tj  = nti+2*type[jnr];
        tj2 = nti+2*type[jnr2];

        xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (pos+j3));
        xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (pos+j23));

        xmm5 = _mm_load1_ps(pos+j3+2);
        xmm6 = _mm_load1_ps(pos+j23+2);

        xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
        xmm5 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));

        xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
        xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
        xmm4 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));

        xmm1 = _mm_load_ss(invsqrta+jnr);
        xmm2 = _mm_load_ss(invsqrta+jnr2);
        xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
        isaj = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));

        xmm1  = _mm_load_ss(dvda+jnr);
        xmm2  = _mm_load_ss(dvda+jnr2);
        xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
        dvdaj = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));

        xmm1 = _mm_load_ss(charge+jnr);
        xmm2 = _mm_load_ss(charge+jnr2);
        xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
        q    = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));

        xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (vdwparam+tj));
        xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (vdwparam+tj2));

        c6  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
        c12 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));

        mask = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
        tj  = nti+2*type[jnr];
        tj2 = nti+2*type[jnr2];
        tj3 = nti+2*type[jnr3];

        xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (pos+j3));
        xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (pos+j23));
        xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (pos+j33));

        xmm5 = _mm_load1_ps(pos+j3+2);
        xmm6 = _mm_load1_ps(pos+j23+2);
        xmm7 = _mm_load1_ps(pos+j33+2);

        xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
        xmm5 = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));

        xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2));
        xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2));

        xmm6 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0));
        xmm4 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1));

        xmm1 = _mm_load_ss(invsqrta+jnr);
        xmm2 = _mm_load_ss(invsqrta+jnr2);
        xmm3 = _mm_load_ss(invsqrta+jnr3);

        xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
        xmm3 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0));
        isaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));

        xmm1 = _mm_load_ss(dvda+jnr);
        xmm2 = _mm_load_ss(dvda+jnr2);
        xmm3 = _mm_load_ss(dvda+jnr3);

        xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
        xmm3  = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0));
        dvdaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));

        xmm1 = _mm_load_ss(charge+jnr);
        xmm2 = _mm_load_ss(charge+jnr2);
        xmm3 = _mm_load_ss(charge+jnr3);

        xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
        xmm3 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0));
        q    = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));

        xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (vdwparam+tj));
        xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (vdwparam+tj2));
        xmm2 = _mm_loadl_pi(xmm2, (__m64 *) (vdwparam+tj3));

        xmm3 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,2,0,2));
        c6   = _mm_shuffle_ps(xmm3,xmm2,_MM_SHUFFLE(1,0,0,1));

        xmm3 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
        c12  = _mm_shuffle_ps(xmm3,xmm2,_MM_SHUFFLE(1,1,1,0));

        mask = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
        jx = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm6);
        jy = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm4);
        jz = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm5);

        c6    = _mm_and_ps( gmx_mm_castsi128_ps(mask), c6);
        c12   = _mm_and_ps( gmx_mm_castsi128_ps(mask), c12);
        dvdaj = _mm_and_ps( gmx_mm_castsi128_ps(mask), dvdaj);
        isaj  = _mm_and_ps( gmx_mm_castsi128_ps(mask), isaj);
        q     = _mm_and_ps( gmx_mm_castsi128_ps(mask), q);
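
        /*
         * With q, c6/c12, isaj and dvdaj zeroed in the unused lanes, the
         * interaction terms computed below vanish there; the energies and
         * force components are masked once more before being accumulated.
         */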
        dx1 = _mm_sub_ps(ix,jx);
        dy1 = _mm_sub_ps(iy,jy);
        dz1 = _mm_sub_ps(iz,jz);

        t1 = _mm_mul_ps(dx1,dx1);
        t2 = _mm_mul_ps(dy1,dy1);
        t3 = _mm_mul_ps(dz1,dz1);

        rsq = _mm_add_ps(t1,t2);
        rsq = _mm_add_ps(rsq,t3);

        rinv = gmx_mm_invsqrt_ps(rsq);

        isaprod = _mm_mul_ps(isai,isaj);
        qq      = _mm_mul_ps(iq,q);
        vcoul   = _mm_mul_ps(qq,rinv);
        fscal   = _mm_mul_ps(vcoul,rinv);

        qq = _mm_mul_ps(qq,neg);
        qq = _mm_mul_ps(isaprod,qq);

        gbscale = _mm_mul_ps(isaprod,gbtabscale_sse);
        rinvsq  = _mm_mul_ps(rinv,rinv);
        r       = _mm_mul_ps(rsq,rinv);
        rt      = _mm_mul_ps(r,gbscale);

        n0  = _mm_cvttps_epi32(rt);
        n0f = _mm_cvtepi32_ps(n0);
        eps = _mm_sub_ps(rt,n0f);

        eps2 = _mm_mul_ps(eps,eps);
        nnn  = _mm_slli_epi32(n0,2);
        /* the tables are 16-byte aligned, so we can use _mm_load_ps */
        xmm1 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
        xmm2 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
        xmm3 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
        xmm4 = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */

        xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
        xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
        xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
        xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

        Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
        F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2));
        G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0));
        H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2));

        Geps  = _mm_mul_ps(G,eps);
        Heps2 = _mm_mul_ps(H,eps2);
        Fp    = _mm_add_ps(F,Geps);
        Fp    = _mm_add_ps(Fp,Heps2);
        VV    = _mm_mul_ps(Fp,eps);
        VV    = _mm_add_ps(Y,VV);

        xmm1 = _mm_mul_ps(two,Heps2);
        FF   = _mm_add_ps(Fp,Geps);
        FF   = _mm_add_ps(FF,xmm1);
        vgb  = _mm_mul_ps(qq,VV);
        fijC = _mm_mul_ps(qq,FF);
        fijC = _mm_mul_ps(fijC,gbscale);

        dvdatmp = _mm_mul_ps(fijC,r);
        dvdatmp = _mm_add_ps(vgb,dvdatmp);
        dvdatmp = _mm_mul_ps(half,dvdatmp);
        dvdatmp = _mm_mul_ps(neg,dvdatmp);

        dvdasum = _mm_add_ps(dvdasum,dvdatmp);

        xmm1  = _mm_mul_ps(dvdatmp,isaj);
        xmm1  = _mm_mul_ps(xmm1,isaj);
        dvdaj = _mm_add_ps(dvdaj,xmm1);

        vcoul = _mm_and_ps( gmx_mm_castsi128_ps(mask), vcoul);
        vgb   = _mm_and_ps( gmx_mm_castsi128_ps(mask), vgb);

        vctot  = _mm_add_ps(vctot,vcoul);
        vgbtot = _mm_add_ps(vgbtot,vgb);

        /* Calculate VDW table index */
        rt   = _mm_mul_ps(r,tabscale_sse);
        n0   = _mm_cvttps_epi32(rt);
        n0f  = _mm_cvtepi32_ps(n0);
        eps  = _mm_sub_ps(rt,n0f);
        eps2 = _mm_mul_ps(eps,eps);
        nnn  = _mm_slli_epi32(n0,3);
        /* Tabulated VdW interaction - dispersion */
        xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
        xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
        xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
        xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */

        xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
        xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
        xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
        xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

        Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
        F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
        G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
        H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */

        Geps  = _mm_mul_ps(G,eps);
        Heps2 = _mm_mul_ps(H,eps2);
        Fp    = _mm_add_ps(F,Geps);
        Fp    = _mm_add_ps(Fp,Heps2);
        VV    = _mm_mul_ps(Fp,eps);
        VV    = _mm_add_ps(Y,VV);
        xmm1  = _mm_mul_ps(two,Heps2);
        FF    = _mm_add_ps(Fp,Geps);
        FF    = _mm_add_ps(FF,xmm1);

        Vvdw6 = _mm_mul_ps(c6,VV);
        fijD  = _mm_mul_ps(c6,FF);

        /* Tabulated VdW interaction - repulsion */
        nnn = _mm_add_epi32(nnn,four);

        xmm1 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0))); /* Y1,F1,G1,H1 */
        xmm2 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1))); /* Y2,F2,G2,H2 */
        xmm3 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2))); /* Y3,F3,G3,H3 */
        xmm4 = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3))); /* Y4,F4,G4,H4 */

        xmm5 = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
        xmm6 = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
        xmm7 = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
        xmm8 = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

        Y = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
        F = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
        G = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
        H = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */

        Geps  = _mm_mul_ps(G,eps);
        Heps2 = _mm_mul_ps(H,eps2);
        Fp    = _mm_add_ps(F,Geps);
        Fp    = _mm_add_ps(Fp,Heps2);
        VV    = _mm_mul_ps(Fp,eps);
        VV    = _mm_add_ps(Y,VV);
        xmm1  = _mm_mul_ps(two,Heps2);
        FF    = _mm_add_ps(Fp,Geps);
        FF    = _mm_add_ps(FF,xmm1);

        Vvdw12 = _mm_mul_ps(c12,VV);
        fijR   = _mm_mul_ps(c12,FF);

        Vvdwtmp = _mm_add_ps(Vvdw12,Vvdw6);
        Vvdwtot = _mm_add_ps(Vvdwtot,Vvdwtmp);

        xmm1  = _mm_add_ps(fijD,fijR);
        xmm1  = _mm_mul_ps(xmm1,tabscale_sse);
        xmm1  = _mm_add_ps(xmm1,fijC);
        xmm1  = _mm_sub_ps(xmm1,fscal);
        fscal = _mm_mul_ps(xmm1,neg);
        fscal = _mm_mul_ps(fscal,rinv);

        t1 = _mm_mul_ps(fscal,dx1);
        t2 = _mm_mul_ps(fscal,dy1);
        t3 = _mm_mul_ps(fscal,dz1);
        _mm_store_ss(dvda+jnr,dvdaj);

        xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (faction+j3));
        xmm7 = _mm_load1_ps(faction+j3+2);

        xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
        xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(1,1,1,1));

        xmm5 = _mm_sub_ps(xmm5,t1);
        xmm6 = _mm_sub_ps(xmm6,t2);
        xmm7 = _mm_sub_ps(xmm7,t3);

        _mm_store_ss(faction+j3,  xmm5);
        _mm_store_ss(faction+j3+1,xmm6);
        _mm_store_ss(faction+j3+2,xmm7);
        _mm_store_ss(dvda+jnr,dvdaj);
        xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1));
        _mm_store_ss(dvda+jnr2,xmm1);

        xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3));
        xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23));

        xmm5 = _mm_load1_ps(faction+j3+2);
        xmm6 = _mm_load1_ps(faction+j23+2);

        xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
        xmm7 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));

        xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
        xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
        xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));

        xmm5 = _mm_sub_ps(xmm5, t1);
        xmm6 = _mm_sub_ps(xmm6, t2);
        xmm7 = _mm_sub_ps(xmm7, t3);

        xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
        xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0));

        _mm_storel_pi( (__m64 *) (faction+j3),  xmm5);
        _mm_storeh_pi( (__m64 *) (faction+j23), xmm5);

        _mm_store_ss(faction+j3+2,xmm7);
        xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
        _mm_store_ss(faction+j23+2,xmm7);
        _mm_store_ss(dvda+jnr,dvdaj);
        xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1));
        _mm_store_ss(dvda+jnr2,xmm1);
        xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(1,0,3,2));
        _mm_store_ss(dvda+jnr3,xmm1);

        xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3));
        xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23));
        xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (faction+j33));

        xmm5 = _mm_load1_ps(faction+j3+2);
        xmm6 = _mm_load1_ps(faction+j23+2);
        xmm7 = _mm_load1_ps(faction+j33+2);

        xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
        xmm6 = _mm_shuffle_ps(xmm7,xmm7, _MM_SHUFFLE(0,0,0,0));
        xmm7 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(2,0,2,0));

        xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
        xmm2 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(3,2,3,2));

        xmm5 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
        xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));

        xmm5 = _mm_sub_ps(xmm5, t1);
        xmm6 = _mm_sub_ps(xmm6, t2);
        xmm7 = _mm_sub_ps(xmm7, t3);

        xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
        xmm2 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2));
        xmm1 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0));
        xmm2 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0));

        _mm_storel_pi( (__m64 *) (faction+j3),  xmm1);
        _mm_storeh_pi( (__m64 *) (faction+j23), xmm1);
        _mm_storel_pi( (__m64 *) (faction+j33), xmm2);

        _mm_store_ss(faction+j3+2,xmm7);
        xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
        _mm_store_ss(faction+j23+2,xmm7);
        xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
        _mm_store_ss(faction+j33+2,xmm7);
        t1 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t1);
        t2 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t2);
        t3 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t3);

        fix = _mm_add_ps(fix,t1);
        fiy = _mm_add_ps(fiy,t2);
        fiz = _mm_add_ps(fiz,t3);
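
        /*
         * Horizontal reduction of the i-force accumulators: movehl folds
         * the upper two lanes onto the lower two, and the (1,1,1,1) shuffle
         * folds lane 1 onto lane 0, leaving the totals in the low elements
         * of fix, fiy and fiz.
         */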
        t1 = _mm_movehl_ps(t1,fix);
        t2 = _mm_movehl_ps(t2,fiy);
        t3 = _mm_movehl_ps(t3,fiz);

        fix = _mm_add_ps(fix,t1);
        fiy = _mm_add_ps(fiy,t2);
        fiz = _mm_add_ps(fiz,t3);

        t1 = _mm_shuffle_ps(fix,fix,_MM_SHUFFLE(1,1,1,1));
        t2 = _mm_shuffle_ps(fiy,fiy,_MM_SHUFFLE(1,1,1,1));
        t3 = _mm_shuffle_ps(fiz,fiz,_MM_SHUFFLE(1,1,1,1));

        fix = _mm_add_ps(fix,t1);
        fiy = _mm_add_ps(fiy,t2);
        fiz = _mm_add_ps(fiz,t3);

        xmm2 = _mm_unpacklo_ps(fix,fiy); /* fx, fy, - - */
        xmm2 = _mm_movelh_ps(xmm2,fiz);
        xmm2 = _mm_and_ps( gmx_mm_castsi128_ps(maski), xmm2);

        /* load i force from memory */
        xmm4 = _mm_loadl_pi(xmm4, (__m64 *) (faction+ii3));
        xmm5 = _mm_load1_ps(faction+ii3+2);
        xmm4 = _mm_shuffle_ps(xmm4,xmm5,_MM_SHUFFLE(3,2,1,0));

        xmm4 = _mm_add_ps(xmm4,xmm2);

        /* store i force to memory */
        _mm_storel_pi( (__m64 *) (faction+ii3),xmm4);
        xmm4 = _mm_shuffle_ps(xmm4,xmm4,_MM_SHUFFLE(2,2,2,2));
        _mm_store_ss(faction+ii3+2,xmm4);

        /* Load, add and store i shift forces */
        xmm4 = _mm_loadl_pi(xmm4, (__m64 *) (fshift+is3));
        xmm5 = _mm_load1_ps(fshift+is3+2);
        xmm4 = _mm_shuffle_ps(xmm4,xmm5,_MM_SHUFFLE(3,2,1,0));

        xmm4 = _mm_add_ps(xmm4,xmm2);

        _mm_storel_pi( (__m64 *) (fshift+is3),xmm4);
        xmm4 = _mm_shuffle_ps(xmm4,xmm4,_MM_SHUFFLE(2,2,2,2));
        _mm_store_ss(fshift+is3+2,xmm4);
        /* Coulomb potential */
        vcoul = _mm_movehl_ps(vcoul,vctot);
        vctot = _mm_add_ps(vctot,vcoul);
        vcoul = _mm_shuffle_ps(vctot,vctot,_MM_SHUFFLE(1,1,1,1));
        vctot = _mm_add_ss(vctot,vcoul);

        _mm_store_ss(&vct,vctot);
        Vc[ggid] = Vc[ggid] + vct;

        Vvdwtmp = _mm_movehl_ps(Vvdwtmp,Vvdwtot);
        Vvdwtot = _mm_add_ps(Vvdwtot,Vvdwtmp);
        Vvdwtmp = _mm_shuffle_ps(Vvdwtot,Vvdwtot,_MM_SHUFFLE(1,1,1,1));
        Vvdwtot = _mm_add_ss(Vvdwtot,Vvdwtmp);

        _mm_store_ss(&vdwt,Vvdwtot);
        Vvdw[ggid] = Vvdw[ggid] + vdwt;

        dvdatmp = _mm_movehl_ps(dvdatmp,dvdasum);
        dvdasum = _mm_add_ps(dvdasum,dvdatmp);
        dvdatmp = _mm_shuffle_ps(dvdasum,dvdasum,_MM_SHUFFLE(1,1,1,1));
        dvdasum = _mm_add_ss(dvdasum,dvdatmp);

        _mm_store_ss(&dva,dvdasum);
        dvda[ii] = dvda[ii] + dva*isai_f*isai_f;

        /* Store the GB potential to the work array */
        vgb    = _mm_movehl_ps(vgb,vgbtot);
        vgbtot = _mm_add_ps(vgbtot,vgb);
        vgb    = _mm_shuffle_ps(vgbtot,vgbtot,_MM_SHUFFLE(1,1,1,1));
        vgbtot = _mm_add_ss(vgbtot,vgb);

        _mm_store_ss(&vgbt,vgbtot);
        gpol[ggid] = gpol[ggid] + vgbt;
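
/*
 * Illustrative sketch only (a hypothetical helper, not part of the
 * generated kernel): the horizontal-add pattern used above, packaged
 * as a standalone function.
 */
static float nb430_ref_hsum_ps(__m128 x)
{
    __m128 t;
    float  sum;

    t = _mm_add_ps(x, _mm_movehl_ps(x, x));                        /* lanes 0+2, 1+3 */
    t = _mm_add_ss(t, _mm_shuffle_ps(t, t, _MM_SHUFFLE(1,1,1,1))); /* + lane 1 */
    _mm_store_ss(&sum, t);
    return sum;
}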
/*
 * Gromacs nonbonded kernel nb_kernel430nf
 * Coulomb interaction:     Generalized-Born
 * VdW interaction:         Tabulated
 * water optimization:      No
 * Calculate forces:        no
 */
void nb_kernel430nf_x86_64_sse(
                               float * p_gbtabscale,
    int nri,ntype,nthreads;
    float facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
    int n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
    float qq,vcoul,vctot;
    float Vvdw6,Vvdwtot;
    float r,rt,eps,eps2;
    float Y,F,Geps,Heps2,Fp,VV;
    float isai,isaj,isaprod,gbscale;
    float dx11,dy11,dz11,rsq11,rinv11;

    nthreads   = *p_nthreads;
    tabscale   = *p_tabscale;
    gbtabscale = *p_gbtabscale;
    for(n=0; (n<nri); n++)
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];

        ix1 = shX + pos[ii3+0];
        iy1 = shY + pos[ii3+1];
        iz1 = shZ + pos[ii3+2];
        iq   = facel*charge[ii];
        isai = invsqrta[ii];
        nti  = 2*ntype*type[ii];

        for(k=nj0; (k<nj1); k++)
            rsq11  = dx11*dx11+dy11*dy11+dz11*dz11;
            rinv11 = gmx_invsqrt(rsq11);
            isaj    = invsqrta[jnr];
            isaprod = isai*isaj;
            qq      = iq*charge[jnr];
            gbscale = isaprod*gbtabscale;
            tj      = nti+2*type[jnr];
            c12     = vdwparam[tj+1];

            Geps  = eps*GBtab[nnn+2];
            Heps2 = eps2*GBtab[nnn+3];
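
            /* Same cubic-spline form as the SSE kernel, in scalar
             * arithmetic: Fp = F + Geps + Heps2, VV = Y + eps*Fp. */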
            vctot = vctot + vcoul;

            Geps  = eps*VFtab[nnn+2];
            Heps2 = eps2*VFtab[nnn+3];

            Geps  = eps*VFtab[nnn+2];
            Heps2 = eps2*VFtab[nnn+3];

            Vvdwtot = Vvdwtot + Vvdw6 + Vvdw12;

        Vc[ggid]   = Vc[ggid] + vctot;
        Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;