2 * Copyright (c) Erik Lindahl, David van der Spoel 2003
4 * This file is generated automatically at compile time
5 * by the program mknb in the Gromacs distribution.
7 * Options used when generation this file:
11 * Software invsqrt: yes
21 #include <emmintrin.h>
23 #include <gmx_sse2_single.h>
25 /* get gmx_gbdata_t */
26 #include "../nb_kerneltype.h"
29 void nb_kernel410_sse2_single(int * p_nri,
61 int nri,ntype,nthreads;
62 int n,ii,is3,ii3,k,nj0,nj1,ggid;
65 int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
66 int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
72 __m128 iq,qq,qqB,jq,jqB,isai;
78 __m128 vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
79 __m128 fix,fiy,fiz,tx,ty,tz,rsq;
80 __m128 fixB,fiyB,fizB,txB,tyB,tzB,rsqB;
81 __m128 rinv,isaj,isaprod;
82 __m128 rinvB,isajB,isaprodB;
83 __m128 vcoul,vcoulB,fscal,fscalB,gbscale,gbscaleB,c6,c6B,c12,c12B;
85 __m128 rinvsqB,rB,rtabB;
87 __m128 epsB,YB,FB,GB,HB;
88 __m128 vgb,vgbB,fijGB,fijGBB,dvdatmp,dvdatmpB;
89 __m128 rinvsix,vvdw6,vvdw12;
90 __m128 rinvsixB,vvdw6B,vvdw12B;
91 __m128 facel,gbtabscale,mask,dvdaj;
93 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
94 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
95 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
100 const __m128 neg = _mm_set1_ps(-1.0f);
101 const __m128 zero = _mm_set1_ps(0.0f);
102 const __m128 minushalf = _mm_set1_ps(-0.5f);
103 const __m128 two = _mm_set1_ps(2.0f);
104 const __m128 six = _mm_set1_ps(6.0f);
105 const __m128 twelve = _mm_set1_ps(12.0f);
107 gbdata = (gmx_gbdata_t *)work;
113 gbfactor = _mm_set1_ps( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
114 gbtabscale = _mm_load1_ps(p_gbtabscale);
115 facel = _mm_load1_ps(p_facel);
118 jnrA = jnrB = jnrC = jnrD = 0;
119 j3A = j3B = j3C = j3D = 0;
120 jx = _mm_setzero_ps();
121 jy = _mm_setzero_ps();
122 jz = _mm_setzero_ps();
123 c6 = _mm_setzero_ps();
124 c12 = _mm_setzero_ps();
127 for(n=0; (n<nri); n++)
131 shY = shiftvec[is3+1];
132 shZ = shiftvec[is3+2];
138 ix = _mm_set1_ps(shX+pos[ii3+0]);
139 iy = _mm_set1_ps(shY+pos[ii3+1]);
140 iz = _mm_set1_ps(shZ+pos[ii3+2]);
142 iq = _mm_load1_ps(charge+ii);
143 iq = _mm_mul_ps(iq,facel);
145 isai = _mm_load1_ps(invsqrta+ii);
147 nti = 2*ntype*type[ii];
149 vctot = _mm_setzero_ps();
150 vvdwtot = _mm_setzero_ps();
151 vgbtot = _mm_setzero_ps();
152 dvdasum = _mm_setzero_ps();
153 fix = _mm_setzero_ps();
154 fiy = _mm_setzero_ps();
155 fiz = _mm_setzero_ps();
157 for(k=nj0; k<nj1-7; k+=8)
178 GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
179 GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3E,pos+j3F,pos+j3G,pos+j3H,jxB,jyB,jzB);
181 dx = _mm_sub_ps(ix,jx);
182 dy = _mm_sub_ps(iy,jy);
183 dz = _mm_sub_ps(iz,jz);
184 dxB = _mm_sub_ps(ix,jxB);
185 dyB = _mm_sub_ps(iy,jyB);
186 dzB = _mm_sub_ps(iz,jzB);
188 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
189 rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
192 rinv = gmx_mm_invsqrt_ps(rsq);
193 rinvB = gmx_mm_invsqrt_ps(rsqB);
194 rinvsq = _mm_mul_ps(rinv,rinv);
195 rinvsqB = _mm_mul_ps(rinvB,rinvB);
197 /***********************************/
198 /* INTERACTION SECTION STARTS HERE */
199 /***********************************/
200 GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
201 GMX_MM_LOAD_4VALUES_PS(charge+jnrE,charge+jnrF,charge+jnrG,charge+jnrH,jqB);
202 GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
203 GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrE,invsqrta+jnrF,invsqrta+jnrG,invsqrta+jnrH,isajB);
206 tjA = nti+2*type[jnrA];
207 tjB = nti+2*type[jnrB];
208 tjC = nti+2*type[jnrC];
209 tjD = nti+2*type[jnrD];
210 tjE = nti+2*type[jnrE];
211 tjF = nti+2*type[jnrF];
212 tjG = nti+2*type[jnrG];
213 tjH = nti+2*type[jnrH];
215 GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,vdwparam+tjD,c6,c12);
216 GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjE,vdwparam+tjF,vdwparam+tjG,vdwparam+tjH,c6B,c12B);
218 isaprod = _mm_mul_ps(isai,isaj);
219 isaprodB = _mm_mul_ps(isai,isajB);
220 qq = _mm_mul_ps(iq,jq);
221 qqB = _mm_mul_ps(iq,jqB);
222 vcoul = _mm_mul_ps(qq,rinv);
223 vcoulB = _mm_mul_ps(qqB,rinvB);
224 fscal = _mm_mul_ps(vcoul,rinv);
225 fscalB = _mm_mul_ps(vcoulB,rinvB);
226 vctot = _mm_add_ps(vctot,vcoul);
227 vctot = _mm_add_ps(vctot,vcoulB);
229 /* Polarization interaction */
230 qq = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
231 qqB = _mm_mul_ps(qqB,_mm_mul_ps(isaprodB,gbfactor));
232 gbscale = _mm_mul_ps(isaprod,gbtabscale);
233 gbscaleB = _mm_mul_ps(isaprodB,gbtabscale);
235 /* Calculate GB table index */
236 r = _mm_mul_ps(rsq,rinv);
237 rB = _mm_mul_ps(rsqB,rinvB);
238 rtab = _mm_mul_ps(r,gbscale);
239 rtabB = _mm_mul_ps(rB,gbscaleB);
241 n0 = _mm_cvttps_epi32(rtab);
242 n0B = _mm_cvttps_epi32(rtabB);
243 eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
244 epsB = _mm_sub_ps(rtabB , _mm_cvtepi32_ps(n0B) );
245 nnn = _mm_slli_epi32(n0,2);
246 nnnB = _mm_slli_epi32(n0B,2);
248 /* the tables are 16-byte aligned, so we can use _mm_load_ps */
249 Y = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
250 F = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
251 G = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
252 H = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
253 _MM_TRANSPOSE4_PS(Y,F,G,H);
254 YB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,0)); /* YFGH */
255 FB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,1)); /* YFGH */
256 GB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,2)); /* YFGH */
257 HB = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,3)); /* YFGH */
258 _MM_TRANSPOSE4_PS(YB,FB,GB,HB);
260 G = _mm_mul_ps(G,eps);
261 H = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
262 F = _mm_add_ps(F, _mm_add_ps( G , H ) );
263 Y = _mm_add_ps(Y, _mm_mul_ps(F, eps));
264 F = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
265 vgb = _mm_mul_ps(Y, qq);
266 fijGB = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
268 GB = _mm_mul_ps(GB,epsB);
269 HB = _mm_mul_ps(HB, _mm_mul_ps(epsB,epsB) );
270 FB = _mm_add_ps(FB, _mm_add_ps( GB , HB ) );
271 YB = _mm_add_ps(YB, _mm_mul_ps(FB, epsB));
272 FB = _mm_add_ps(FB, _mm_add_ps(GB , _mm_mul_ps(HB,two)));
273 vgbB = _mm_mul_ps(YB, qqB);
274 fijGBB = _mm_mul_ps(FB, _mm_mul_ps(qqB,gbscaleB));
277 dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
278 dvdatmpB = _mm_mul_ps(_mm_add_ps(vgbB, _mm_mul_ps(fijGBB,rB)) , minushalf);
280 vgbtot = _mm_add_ps(vgbtot, vgb);
281 vgbtot = _mm_add_ps(vgbtot, vgbB);
283 dvdasum = _mm_add_ps(dvdasum, dvdatmp);
284 dvdasum = _mm_add_ps(dvdasum, dvdatmpB);
285 dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
286 dvdatmpB = _mm_mul_ps(dvdatmpB, _mm_mul_ps(isajB,isajB));
288 GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
289 GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrE,dvda+jnrF,dvda+jnrG,dvda+jnrH,dvdatmpB);
291 rinvsix = _mm_mul_ps(rinvsq,rinvsq);
292 rinvsixB = _mm_mul_ps(rinvsqB,rinvsqB);
293 rinvsix = _mm_mul_ps(rinvsix,rinvsq);
294 rinvsixB = _mm_mul_ps(rinvsixB,rinvsqB);
296 vvdw6 = _mm_mul_ps(c6,rinvsix);
297 vvdw6B = _mm_mul_ps(c6B,rinvsixB);
298 vvdw12 = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
299 vvdw12B = _mm_mul_ps(c12B, _mm_mul_ps(rinvsixB,rinvsixB));
300 vvdwtot = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
301 vvdwtot = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12B,vvdw6B));
303 fscal = _mm_sub_ps(_mm_mul_ps(rinvsq,
304 _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
305 _mm_mul_ps(six,vvdw6))),
306 _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
308 fscalB = _mm_sub_ps(_mm_mul_ps(rinvsqB,
309 _mm_sub_ps(_mm_mul_ps(twelve,vvdw12B),
310 _mm_mul_ps(six,vvdw6B))),
311 _mm_mul_ps( _mm_sub_ps( fijGBB,fscalB),rinvB ));
313 /***********************************/
314 /* INTERACTION SECTION ENDS HERE */
315 /***********************************/
317 /* Calculate temporary vectorial force */
318 tx = _mm_mul_ps(fscal,dx);
319 ty = _mm_mul_ps(fscal,dy);
320 tz = _mm_mul_ps(fscal,dz);
321 txB = _mm_mul_ps(fscalB,dxB);
322 tyB = _mm_mul_ps(fscalB,dyB);
323 tzB = _mm_mul_ps(fscalB,dzB);
325 /* Increment i atom force */
326 fix = _mm_add_ps(fix,tx);
327 fiy = _mm_add_ps(fiy,ty);
328 fiz = _mm_add_ps(fiz,tz);
329 fix = _mm_add_ps(fix,txB);
330 fiy = _mm_add_ps(fiy,tyB);
331 fiz = _mm_add_ps(fiz,tzB);
333 /* Store j forces back */
334 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
335 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3E,faction+j3F,faction+j3G,faction+j3H,txB,tyB,tzB);
350 GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
352 dx = _mm_sub_ps(ix,jx);
353 dy = _mm_sub_ps(iy,jy);
354 dz = _mm_sub_ps(iz,jz);
356 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
359 rinv = gmx_mm_invsqrt_ps(rsq);
360 rinvsq = _mm_mul_ps(rinv,rinv);
362 /***********************************/
363 /* INTERACTION SECTION STARTS HERE */
364 /***********************************/
365 GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
366 GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
369 tjA = nti+2*type[jnrA];
370 tjB = nti+2*type[jnrB];
371 tjC = nti+2*type[jnrC];
372 tjD = nti+2*type[jnrD];
374 GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,vdwparam+tjD,c6,c12);
376 isaprod = _mm_mul_ps(isai,isaj);
377 qq = _mm_mul_ps(iq,jq);
378 vcoul = _mm_mul_ps(qq,rinv);
379 fscal = _mm_mul_ps(vcoul,rinv);
380 vctot = _mm_add_ps(vctot,vcoul);
382 /* Polarization interaction */
383 qq = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
384 gbscale = _mm_mul_ps(isaprod,gbtabscale);
386 /* Calculate GB table index */
387 r = _mm_mul_ps(rsq,rinv);
388 rtab = _mm_mul_ps(r,gbscale);
390 n0 = _mm_cvttps_epi32(rtab);
391 eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
392 nnn = _mm_slli_epi32(n0,2);
394 /* the tables are 16-byte aligned, so we can use _mm_load_ps */
395 Y = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
396 F = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
397 G = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
398 H = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
399 _MM_TRANSPOSE4_PS(Y,F,G,H);
401 G = _mm_mul_ps(G,eps);
402 H = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
403 F = _mm_add_ps(F, _mm_add_ps( G , H ) );
404 Y = _mm_add_ps(Y, _mm_mul_ps(F, eps));
405 F = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
406 vgb = _mm_mul_ps(Y, qq);
407 fijGB = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
410 dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
412 vgbtot = _mm_add_ps(vgbtot, vgb);
414 dvdasum = _mm_add_ps(dvdasum, dvdatmp);
415 dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
417 GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
420 rinvsix = _mm_mul_ps(rinvsq,rinvsq);
421 rinvsix = _mm_mul_ps(rinvsix,rinvsq);
423 vvdw6 = _mm_mul_ps(c6,rinvsix);
424 vvdw12 = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
425 vvdwtot = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
427 fscal = _mm_sub_ps(_mm_mul_ps(rinvsq,
428 _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
429 _mm_mul_ps(six,vvdw6))),
430 _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
432 /***********************************/
433 /* INTERACTION SECTION ENDS HERE */
434 /***********************************/
436 /* Calculate temporary vectorial force */
437 tx = _mm_mul_ps(fscal,dx);
438 ty = _mm_mul_ps(fscal,dy);
439 tz = _mm_mul_ps(fscal,dz);
441 /* Increment i atom force */
442 fix = _mm_add_ps(fix,tx);
443 fiy = _mm_add_ps(fiy,ty);
444 fiz = _mm_add_ps(fiz,tz);
446 /* Store j forces back */
447 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
450 offset = (nj1-nj0)%4;
454 /* Epilogue loop to take care of non-4-multiple j particles */
459 tjA = nti+2*type[jnrA];
460 GMX_MM_LOAD_1RVEC_1POINTER_PS(pos+j3A,jx,jy,jz);
461 GMX_MM_LOAD_1VALUE_PS(charge+jnrA,jq);
462 GMX_MM_LOAD_1VALUE_PS(invsqrta+jnrA,isaj);
463 GMX_MM_LOAD_1PAIR_PS(vdwparam+tjA,c6,c12);
472 tjA = nti+2*type[jnrA];
473 tjB = nti+2*type[jnrB];
474 GMX_MM_LOAD_1RVEC_2POINTERS_PS(pos+j3A,pos+j3B,jx,jy,jz);
475 GMX_MM_LOAD_2VALUES_PS(charge+jnrA,charge+jnrB,jq);
476 GMX_MM_LOAD_2VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,isaj);
477 GMX_MM_LOAD_2PAIRS_PS(vdwparam+tjA,vdwparam+tjB,c6,c12);
482 /* offset must be 3 */
489 tjA = nti+2*type[jnrA];
490 tjB = nti+2*type[jnrB];
491 tjC = nti+2*type[jnrC];
492 GMX_MM_LOAD_1RVEC_3POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,jx,jy,jz);
493 GMX_MM_LOAD_3VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,jq);
494 GMX_MM_LOAD_3VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,isaj);
495 GMX_MM_LOAD_3PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,c6,c12);
499 dx = _mm_sub_ps(ix,jx);
500 dy = _mm_sub_ps(iy,jy);
501 dz = _mm_sub_ps(iz,jz);
503 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
506 rinv = gmx_mm_invsqrt_ps(rsq);
507 rinv = _mm_and_ps(rinv,mask);
509 rinvsq = _mm_mul_ps(rinv,rinv);
511 /***********************************/
512 /* INTERACTION SECTION STARTS HERE */
513 /***********************************/
514 isaprod = _mm_mul_ps(isai,isaj);
515 qq = _mm_and_ps(_mm_mul_ps(iq,jq),mask);
516 vcoul = _mm_mul_ps(qq,rinv);
517 fscal = _mm_mul_ps(vcoul,rinv);
518 vctot = _mm_add_ps(vctot,vcoul);
520 /* Polarization interaction */
521 qq = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
522 gbscale = _mm_mul_ps(isaprod,gbtabscale);
524 /* Calculate GB table index */
525 r = _mm_mul_ps(rsq,rinv);
526 rtab = _mm_mul_ps(r,gbscale);
528 n0 = _mm_cvttps_epi32(rtab);
529 eps = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
530 nnn = _mm_slli_epi32(n0,2);
532 /* the tables are 16-byte aligned, so we can use _mm_load_ps */
533 Y = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
534 F = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
535 G = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
536 H = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
537 _MM_TRANSPOSE4_PS(Y,F,G,H);
539 G = _mm_mul_ps(G,eps);
540 H = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
541 F = _mm_add_ps(F, _mm_add_ps( G , H ) );
542 Y = _mm_add_ps(Y, _mm_mul_ps(F, eps));
543 F = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
544 vgb = _mm_mul_ps(Y, qq);
545 fijGB = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
547 dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
548 vgbtot = _mm_add_ps(vgbtot, vgb);
550 dvdasum = _mm_add_ps(dvdasum, dvdatmp);
551 dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
555 rinvsix = _mm_mul_ps(rinvsq,rinvsq);
556 rinvsix = _mm_mul_ps(rinvsix,rinvsq);
558 vvdw6 = _mm_mul_ps(c6,rinvsix);
559 vvdw12 = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
560 vvdwtot = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
562 fscal = _mm_sub_ps(_mm_mul_ps(rinvsq,
563 _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
564 _mm_mul_ps(six,vvdw6))),
565 _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
567 /***********************************/
568 /* INTERACTION SECTION ENDS HERE */
569 /***********************************/
571 /* Calculate temporary vectorial force */
572 tx = _mm_mul_ps(fscal,dx);
573 ty = _mm_mul_ps(fscal,dy);
574 tz = _mm_mul_ps(fscal,dz);
576 /* Increment i atom force */
577 fix = _mm_add_ps(fix,tx);
578 fiy = _mm_add_ps(fiy,ty);
579 fiz = _mm_add_ps(fiz,tz);
583 GMX_MM_INCREMENT_1VALUE_PS(dvda+jnrA,dvdatmp);
584 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(faction+j3A,tx,ty,tz);
588 GMX_MM_INCREMENT_2VALUES_PS(dvda+jnrA,dvda+jnrB,dvdatmp);
589 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(faction+j3A,faction+j3B,tx,ty,tz);
593 /* offset must be 3 */
594 GMX_MM_INCREMENT_3VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvdatmp);
595 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,tx,ty,tz);
599 dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai,isai));
600 gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
603 GMX_MM_UPDATE_4POT_PS(vctot,vc+ggid,vvdwtot,vvdw+ggid,vgbtot,gpol+ggid,dvdasum,dvda+ii);
613 * Gromacs nonbonded kernel nb_kernel410nf
614 * Coulomb interaction: Generalized-Born
615 * VdW interaction: Lennard-Jones
616 * water optimization: No
617 * Calculate forces: no
619 void nb_kernel410nf_x86_64_sse(
643 float * p_gbtabscale,
652 int nri,ntype,nthreads;
653 float facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
654 int n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
658 float qq,vcoul,vctot;
666 float Y,F,Geps,Heps2,Fp,VV;
667 float isai,isaj,isaprod,gbscale;
670 float dx11,dy11,dz11,rsq11,rinv11;
672 const int fractshift = 12;
673 const int fractmask = 8388607;
674 const int expshift = 23;
675 const int expmask = 2139095040;
676 const int explsb = 8388608;
679 union { unsigned int bval; float fval; } bitpattern,result;
683 nthreads = *p_nthreads;
687 tabscale = *p_tabscale;
688 gbtabscale = *p_gbtabscale;
691 for(n=0; (n<nri); n++)
695 shY = shiftvec[is3+1];
696 shZ = shiftvec[is3+2];
701 ix1 = shX + pos[ii3+0];
702 iy1 = shY + pos[ii3+1];
703 iz1 = shZ + pos[ii3+2];
704 iq = facel*charge[ii];
706 nti = 2*ntype*type[ii];
710 for(k=nj0; (k<nj1); k++)
720 rsq11 = dx11*dx11+dy11*dy11+dz11*dz11;
721 bitpattern.fval = rsq11;
722 iexp = (((bitpattern.bval)&expmask)>>expshift);
723 addr = (((bitpattern.bval)&(fractmask|explsb))>>fractshift);
724 result.bval = gmx_invsqrt_exptab[iexp] | gmx_invsqrt_fracttab[addr];
726 rinv11 = (0.5*lu*(3.0-((rsq11*lu)*lu)));
727 isaj = invsqrta[jnr];
732 gbscale = isaprod*gbtabscale;
733 tj = nti+2*type[jnr];
735 c12 = vdwparam[tj+1];
736 rinvsq = rinv11*rinv11;
745 Geps = eps*GBtab[nnn+2];
746 Heps2 = eps2*GBtab[nnn+3];
750 vctot = vctot + vcoul;
751 rinvsix = rinvsq*rinvsq*rinvsq;
753 vvdw12 = c12*rinvsix*rinvsix;
754 vvdwtot = vvdwtot+vvdw12-vvdw6;
758 Vc[ggid] = Vc[ggid] + vctot;
759 vvdw[ggid] = vvdw[ggid] + vvdwtot;