3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2008, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
32 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
52 #include "gmx_fatal.h"
53 #include "mtop_util.h"
64 /* Only compile this file if SSE intrinsics are available */
/* NOTE(review): the "0 &&" below makes this condition ALWAYS false, so the
 * entire translation unit is compiled out even when GMX_X86_SSE2 is defined.
 * That contradicts the comment above — confirm this is an intentional
 * (temporary) kill-switch and not a leftover debugging edit. */
65 #if 0 && defined (GMX_X86_SSE2)
67 #include <gmx_sse2_single.h>
68 #include <emmintrin.h>
70 #include "genborn_sse2_single.h"
/*
 * calc_gb_rad_still_sse2_single() — SSE2 (4-wide, single precision) kernel
 * that computes Generalized Born radii using the Still model.
 *
 * For each i atom in the neighbor list nl it accumulates a polarization
 * term gpi over all j neighbors, scatters the symmetric j-side
 * contributions into born->gpol_still_work, and stores the per-pair
 * derivative factors into the dadx array (consumed later by the
 * chain-rule force kernel).  After an optional parallel reduction the
 * Born radii born->bRad[] and fr->invsqrta[] are computed from
 * born->gpol[] + work[].
 *
 * NOTE(review): this listing is an excerpt — the integer at the start of
 * each line is the original file's line number, and jumps in that
 * numbering mark omitted lines (braces, j-index loads, dadx pointer
 * increments, loop epilogues).  Code below is kept byte-identical;
 * only comments were added.
 */
74 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
75 int natoms, gmx_localtop_t *top,
76 const t_atomtypes *atype, float *x, t_nblist *nl,
/* A/B suffixed registers below hold the second batch of 4 j atoms of the
 * 8-way unrolled inner loop (jnrA..D = first 4, jnrE..H = second 4). */
79 int i,k,n,ii,is3,ii3,nj0,nj1,offset;
80 int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
81 int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
102 __m128 rsq,rinv,rinv2,rinv4,rinv6;
103 __m128 rsqB,rinvB,rinv2B,rinv4B,rinv6B;
104 __m128 ratio,gpi,rai,raj,vai,vaj,rvdw;
105 __m128 ratioB,rajB,vajB,rvdwB;
106 __m128 ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
107 __m128 ccfB,dccfB,thetaB,cosqB,termB,sinqB,resB,prodB;
108 __m128 mask,icf4,icf6,mask_cmp;
109 __m128 icf4B,icf6B,mask_cmpB;
/* Lane-select masks: lowest 1, 2 or 3 lanes active — presumably used for
 * the 1/2/3-neighbor remainder iterations (TODO confirm; usage lines are
 * not all visible in this excerpt). */
111 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
112 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
113 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
115 const __m128 half = _mm_set1_ps(0.5f);
116 const __m128 three = _mm_set1_ps(3.0f);
117 const __m128 one = _mm_set1_ps(1.0f);
118 const __m128 two = _mm_set1_ps(2.0f);
119 const __m128 zero = _mm_set1_ps(0.0f);
120 const __m128 four = _mm_set1_ps(4.0f);
/* Still-model constants broadcast to all four SIMD lanes. */
122 const __m128 still_p5inv = _mm_set1_ps(STILL_P5INV);
123 const __m128 still_pip5 = _mm_set1_ps(STILL_PIP5);
124 const __m128 still_p4 = _mm_set1_ps(STILL_P4);
/* Electrostatic prefactor applied when converting gpi to a Born radius. */
126 factor = 0.5 * ONE_4PI_EPS0;
128 gb_radius = born->gb_radius;
130 work = born->gpol_still_work;
132 shiftvec = fr->shift_vec[0];
/* Keep the compiler happy / avoid reading uninitialized registers before
 * the first loads in the inner loop. */
135 jnrA = jnrB = jnrC = jnrD = 0;
136 jx = _mm_setzero_ps();
137 jy = _mm_setzero_ps();
138 jz = _mm_setzero_ps();
/* Clear the per-atom work accumulator (loop body omitted in excerpt). */
142 for(i=0;i<natoms;i++)
/* Outer loop over neighbor-list entries (i atoms). */
147 for(i=0;i<nl->nri;i++)
151 is3 = 3*nl->shift[i];
153 shY = shiftvec[is3+1];
154 shZ = shiftvec[is3+2];
156 nj1 = nl->jindex[i+1];
/* Broadcast the (PBC-shifted) i-atom coordinates to all lanes. */
158 ix = _mm_set1_ps(shX+x[ii3+0]);
159 iy = _mm_set1_ps(shY+x[ii3+1]);
160 iz = _mm_set1_ps(shZ+x[ii3+2]);
/* Number of j atoms left over after the 4-wide vector loops (0..3). */
162 offset = (nj1-nj0)%4;
164 /* Polarization energy for atom ai */
165 gpi = _mm_setzero_ps();
167 rai = _mm_load1_ps(gb_radius+ii);
168 prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
/* Main inner loop, unrolled 8 ways: two independent 4-lane batches
 * (unsuffixed and B-suffixed) per iteration. */
170 for(k=nj0;k<nj1-4-offset;k+=8)
190 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
191 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
193 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
194 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
195 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
196 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE,vsolv+jnrF,vsolv+jnrG,vsolv+jnrH,vajB);
/* Pair distances and the inverse powers r^-2, r^-4, r^-6 used below. */
198 dx = _mm_sub_ps(ix,jx);
199 dy = _mm_sub_ps(iy,jy);
200 dz = _mm_sub_ps(iz,jz);
201 dxB = _mm_sub_ps(ix,jxB);
202 dyB = _mm_sub_ps(iy,jyB);
203 dzB = _mm_sub_ps(iz,jzB);
205 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
206 rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
207 rinv = gmx_mm_invsqrt_ps(rsq);
208 rinvB = gmx_mm_invsqrt_ps(rsqB);
209 rinv2 = _mm_mul_ps(rinv,rinv);
210 rinv2B = _mm_mul_ps(rinvB,rinvB);
211 rinv4 = _mm_mul_ps(rinv2,rinv2);
212 rinv4B = _mm_mul_ps(rinv2B,rinv2B);
213 rinv6 = _mm_mul_ps(rinv4,rinv2);
214 rinv6B = _mm_mul_ps(rinv4B,rinv2B);
/* ratio = r^2 / (rai+raj)^2 decides whether the close-contact correction
 * factor (ccf) differs from its asymptotic value. */
216 rvdw = _mm_add_ps(rai,raj);
217 rvdwB = _mm_add_ps(rai,rajB);
218 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
219 ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB,rvdwB)));
221 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
222 mask_cmpB = _mm_cmple_ps(ratioB,still_p5inv);
224 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
225 if( 0 == _mm_movemask_ps(mask_cmp) )
227 /* if ratio>still_p5inv for ALL elements */
229 dccf = _mm_setzero_ps();
/* else-branch (brace lines omitted): at least one lane is in close
 * contact, so compute ccf = (0.5*(1-cos(theta)))^2 and its derivative
 * dccf via the vectorized sincos. */
233 ratio = _mm_min_ps(ratio,still_p5inv);
234 theta = _mm_mul_ps(ratio,still_pip5);
235 gmx_mm_sincos_ps(theta,&sinq,&cosq);
236 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
237 ccf = _mm_mul_ps(term,term);
238 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
239 _mm_mul_ps(sinq,theta));
/* Same close-contact test and correction for the second batch. */
241 if( 0 == _mm_movemask_ps(mask_cmpB) )
243 /* if ratio>still_p5inv for ALL elements */
245 dccfB = _mm_setzero_ps();
249 ratioB = _mm_min_ps(ratioB,still_p5inv);
250 thetaB = _mm_mul_ps(ratioB,still_pip5);
251 gmx_mm_sincos_ps(thetaB,&sinqB,&cosqB);
252 termB = _mm_mul_ps(half,_mm_sub_ps(one,cosqB));
253 ccfB = _mm_mul_ps(termB,termB);
254 dccfB = _mm_mul_ps(_mm_mul_ps(two,termB),
255 _mm_mul_ps(sinqB,thetaB));
/* icf4/icf6 are the ccf-weighted r^-4 / r^-6 interaction factors;
 * prod carries STILL_P4 * vsolv(j), prod_ai the i-side counterpart. */
258 prod = _mm_mul_ps(still_p4,vaj);
259 prodB = _mm_mul_ps(still_p4,vajB);
260 icf4 = _mm_mul_ps(ccf,rinv4);
261 icf4B = _mm_mul_ps(ccfB,rinv4B);
262 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
263 icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccfB),dccfB), rinv6B);
/* Scatter the j-side contributions and accumulate the i-side sum gpi. */
265 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
266 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_mul_ps(prod_ai,icf4B));
268 gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod,icf4) , _mm_mul_ps(prodB,icf4B) ) );
/* Store derivative factors for the chain-rule force pass; the dadx
 * pointer advances between stores on lines omitted from this excerpt. */
270 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
272 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
274 _mm_store_ps(dadx,_mm_mul_ps(prodB,icf6B));
276 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6B));
/* 4-wide epilogue loop: same computation as above, single batch. */
280 for(;k<nj1-offset;k+=4)
292 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
294 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
295 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
297 dx = _mm_sub_ps(ix,jx);
298 dy = _mm_sub_ps(iy,jy);
299 dz = _mm_sub_ps(iz,jz);
301 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
302 rinv = gmx_mm_invsqrt_ps(rsq);
303 rinv2 = _mm_mul_ps(rinv,rinv);
304 rinv4 = _mm_mul_ps(rinv2,rinv2);
305 rinv6 = _mm_mul_ps(rinv4,rinv2);
307 rvdw = _mm_add_ps(rai,raj);
308 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
310 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
312 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
313 if(0 == _mm_movemask_ps(mask_cmp))
315 /* if ratio>still_p5inv for ALL elements */
317 dccf = _mm_setzero_ps();
321 ratio = _mm_min_ps(ratio,still_p5inv);
322 theta = _mm_mul_ps(ratio,still_pip5);
323 gmx_mm_sincos_ps(theta,&sinq,&cosq);
324 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
325 ccf = _mm_mul_ps(term,term);
326 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
327 _mm_mul_ps(sinq,theta));
330 prod = _mm_mul_ps(still_p4,vaj);
331 icf4 = _mm_mul_ps(ccf,rinv4);
332 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
334 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
336 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
338 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
340 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
/* Remainder handling: load 1, 2 or 3 leftover j atoms depending on
 * offset (the switch/if scaffolding is omitted from this excerpt). */
350 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
351 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
352 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA,vaj);
361 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
362 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
363 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA,vsolv+jnrB,vaj);
368 /* offset must be 3 */
375 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
376 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
377 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vaj);
/* Remainder iteration uses the same arithmetic as the vector loops. */
381 dx = _mm_sub_ps(ix,jx);
382 dy = _mm_sub_ps(iy,jy);
383 dz = _mm_sub_ps(iz,jz);
385 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
386 rinv = gmx_mm_invsqrt_ps(rsq);
387 rinv2 = _mm_mul_ps(rinv,rinv);
388 rinv4 = _mm_mul_ps(rinv2,rinv2);
389 rinv6 = _mm_mul_ps(rinv4,rinv2);
391 rvdw = _mm_add_ps(rai,raj);
392 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
394 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
396 if(0 == _mm_movemask_ps(mask_cmp))
398 /* if ratio>still_p5inv for ALL elements */
400 dccf = _mm_setzero_ps();
404 ratio = _mm_min_ps(ratio,still_p5inv);
405 theta = _mm_mul_ps(ratio,still_pip5);
406 gmx_mm_sincos_ps(theta,&sinq,&cosq);
407 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
408 ccf = _mm_mul_ps(term,term);
409 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
410 _mm_mul_ps(sinq,theta));
413 prod = _mm_mul_ps(still_p4,vaj);
414 icf4 = _mm_mul_ps(ccf,rinv4);
415 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
417 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
419 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
421 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
/* Scatter the j-side remainder increments into 1/2/3 work slots. */
424 tmp = _mm_mul_ps(prod_ai,icf4);
428 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
432 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
436 /* offset must be 3 */
437 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
/* Horizontal reduction of the 4-lane gpi accumulator into work[ii]. */
440 GMX_MM_UPDATE_1POT_PS(gpi,work+ii);
443 /* Sum up the polarization energy from other nodes */
446 gmx_sum(natoms, work, cr);
448 else if(DOMAINDECOMP(cr))
450 dd_atom_sum_real(cr->dd, work);
453 /* Compute the radii */
454 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
456 if(born->use[i] != 0)
458 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
459 gpi2 = gpi_ai * gpi_ai;
/* bRad = factor/|gpi_ai| (gmx_invsqrt of the square gives 1/|gpi_ai|);
 * invsqrta caches 1/sqrt(bRad) for the energy/force kernels. */
460 born->bRad[i] = factor*gmx_invsqrt(gpi2);
461 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
465 /* Extra (local) communication required for DD */
468 dd_atom_spread_real(cr->dd, born->bRad);
469 dd_atom_spread_real(cr->dd, fr->invsqrta);
/*
 * calc_gb_rad_hct_obc_sse2_single() — SSE2 (4-wide, single precision)
 * kernel computing Generalized Born radii for the HCT and OBC models
 * (selected at the end via gb_algorithm: egbHCT vs. the OBC/tanh branch).
 *
 * For every i/j pair the pairwise descreening integral is evaluated twice
 * (aj->ai and ai->aj) using lower/upper integration limits lij/uij; the
 * i-side terms accumulate in sum_ai, the j-side terms are scattered into
 * born->gpol_hct_work, and the derivative factors dadx1/dadx2 are stored
 * for the chain-rule force pass.  obc_mask1/2/3 encode the conditions
 * radius < r+sk, radius < r-sk and radius < sk-r respectively.
 *
 * NOTE(review): this listing is an excerpt — the integer at the start of
 * each line is the original file's line number, and jumps in that
 * numbering mark omitted lines (braces, j-index loads, dadx pointer
 * increments, parts of multi-line expressions).  Code below is kept
 * byte-identical; only comments were added.
 */
477 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
478 const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
480 int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
481 int jnrA,jnrB,jnrC,jnrD;
483 int jnrE,jnrF,jnrG,jnrH;
486 float rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
487 float sum_ai2, sum_ai3,tsum,tchain,doffset;
496 __m128 ix,iy,iz,jx,jy,jz;
497 __m128 dx,dy,dz,t1,t2,t3,t4;
499 __m128 rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
500 __m128 uij,lij2,uij2,lij3,uij3,diff2;
501 __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
502 __m128 sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
506 __m128 obc_mask1,obc_mask2,obc_mask3;
/* B-suffixed registers: second batch of 4 j atoms in the 8-way unrolled
 * main loop (jnrA..D vs. jnrE..H). */
507 __m128 jxB,jyB,jzB,t1B,t2B,t3B,t4B;
508 __m128 dxB,dyB,dzB,rsqB,rinvB,rB;
509 __m128 rajB, raj_invB,rai_inv2B,sk2B,lijB,dlijB,duijB;
510 __m128 uijB,lij2B,uij2B,lij3B,uij3B,diff2B;
511 __m128 lij_invB,sk2_invB,prodB;
512 __m128 sk_ajB,sk2_ajB,sk2_rinvB;
513 __m128 dadx1B,dadx2B;
515 __m128 obc_mask1B,obc_mask2B,obc_mask3B;
/* Lane-select masks for the 1/2/3-neighbor remainder iterations; in the
 * remainder block below the active mask is ANDed into obc_mask1. */
517 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
518 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
519 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
521 __m128 oneeighth = _mm_set1_ps(0.125);
522 __m128 onefourth = _mm_set1_ps(0.25);
524 const __m128 half = _mm_set1_ps(0.5f);
525 const __m128 three = _mm_set1_ps(3.0f);
526 const __m128 one = _mm_set1_ps(1.0f);
527 const __m128 two = _mm_set1_ps(2.0f);
528 const __m128 zero = _mm_set1_ps(0.0f);
529 const __m128 neg = _mm_set1_ps(-1.0f);
531 /* Set the dielectric offset */
532 doffset = born->gb_doffset;
533 gb_radius = born->gb_radius;
/* obc_param holds the per-atom descreening scale factors sk. */
534 obc_param = born->param;
535 work = born->gpol_hct_work;
538 shiftvec = fr->shift_vec[0];
540 jx = _mm_setzero_ps();
541 jy = _mm_setzero_ps();
542 jz = _mm_setzero_ps();
544 jnrA = jnrB = jnrC = jnrD = 0;
/* Clear the per-atom work accumulator (loop body omitted in excerpt). */
546 for(i=0;i<born->nr;i++)
/* Outer loop over neighbor-list entries (i atoms). */
551 for(i=0;i<nl->nri;i++)
555 is3 = 3*nl->shift[i];
557 shY = shiftvec[is3+1];
558 shZ = shiftvec[is3+2];
560 nj1 = nl->jindex[i+1];
/* Broadcast the (PBC-shifted) i-atom coordinates to all lanes. */
562 ix = _mm_set1_ps(shX+x[ii3+0]);
563 iy = _mm_set1_ps(shY+x[ii3+1]);
564 iz = _mm_set1_ps(shZ+x[ii3+2]);
/* Number of j atoms left over after the 4-wide vector loops (0..3). */
566 offset = (nj1-nj0)%4;
568 rai = _mm_load1_ps(gb_radius+ii);
569 rai_inv= gmx_mm_inv_ps(rai);
571 sum_ai = _mm_setzero_ps();
573 sk_ai = _mm_load1_ps(born->param+ii);
574 sk2_ai = _mm_mul_ps(sk_ai,sk_ai);
/* Main inner loop, unrolled 8 ways (two 4-lane batches per iteration). */
576 for(k=nj0;k<nj1-4-offset;k+=8)
596 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
597 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
598 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
599 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
600 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
601 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE,obc_param+jnrF,obc_param+jnrG,obc_param+jnrH,sk_ajB);
603 dx = _mm_sub_ps(ix, jx);
604 dy = _mm_sub_ps(iy, jy);
605 dz = _mm_sub_ps(iz, jz);
606 dxB = _mm_sub_ps(ix, jxB);
607 dyB = _mm_sub_ps(iy, jyB);
608 dzB = _mm_sub_ps(iz, jzB);
610 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
611 rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
/* r = rsq*rinv avoids a separate sqrt. */
613 rinv = gmx_mm_invsqrt_ps(rsq);
614 r = _mm_mul_ps(rsq,rinv);
615 rinvB = gmx_mm_invsqrt_ps(rsqB);
616 rB = _mm_mul_ps(rsqB,rinvB);
618 /* Compute raj_inv aj1-4 */
619 raj_inv = gmx_mm_inv_ps(raj);
620 raj_invB = gmx_mm_inv_ps(rajB);
622 /* Evaluate influence of atom aj -> ai */
623 t1 = _mm_add_ps(r,sk_aj);
624 t2 = _mm_sub_ps(r,sk_aj);
625 t3 = _mm_sub_ps(sk_aj,r);
626 t1B = _mm_add_ps(rB,sk_ajB);
627 t2B = _mm_sub_ps(rB,sk_ajB);
628 t3B = _mm_sub_ps(sk_ajB,rB);
/* mask1: rai < r+sk (pair contributes at all); mask2: rai < r-sk
 * (i sphere outside the descreening sphere); mask3: rai < sk-r
 * (i sphere engulfed — triggers the t4 correction term). */
629 obc_mask1 = _mm_cmplt_ps(rai, t1);
630 obc_mask2 = _mm_cmplt_ps(rai, t2);
631 obc_mask3 = _mm_cmplt_ps(rai, t3);
632 obc_mask1B = _mm_cmplt_ps(rai, t1B);
633 obc_mask2B = _mm_cmplt_ps(rai, t2B);
634 obc_mask3B = _mm_cmplt_ps(rai, t3B);
/* Integration limits: uij = 1/(r+sk); lij = 1/(r-sk) where mask2 holds,
 * otherwise 1/rai (mask-blend via and/andnot/or). */
636 uij = gmx_mm_inv_ps(t1);
637 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
638 _mm_andnot_ps(obc_mask2,rai_inv));
639 dlij = _mm_and_ps(one,obc_mask2);
640 uij2 = _mm_mul_ps(uij, uij);
641 uij3 = _mm_mul_ps(uij2,uij);
642 lij2 = _mm_mul_ps(lij, lij);
643 lij3 = _mm_mul_ps(lij2,lij);
645 uijB = gmx_mm_inv_ps(t1B);
646 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
647 _mm_andnot_ps(obc_mask2B,rai_inv));
648 dlijB = _mm_and_ps(one,obc_mask2B);
649 uij2B = _mm_mul_ps(uijB, uijB);
650 uij3B = _mm_mul_ps(uij2B,uijB);
651 lij2B = _mm_mul_ps(lijB, lijB);
652 lij3B = _mm_mul_ps(lij2B,lijB);
654 diff2 = _mm_sub_ps(uij2,lij2);
655 lij_inv = gmx_mm_invsqrt_ps(lij2);
656 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
657 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
658 prod = _mm_mul_ps(onefourth,sk2_rinv);
660 diff2B = _mm_sub_ps(uij2B,lij2B);
661 lij_invB = gmx_mm_invsqrt_ps(lij2B);
662 sk2_ajB = _mm_mul_ps(sk_ajB,sk_ajB);
663 sk2_rinvB = _mm_mul_ps(sk2_ajB,rinvB);
664 prodB = _mm_mul_ps(onefourth,sk2_rinvB);
666 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
667 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
/* t1 becomes the descreening integral value; t4 is the extra term added
 * only where the i sphere is fully engulfed (obc_mask3).  Note: parts of
 * the t2 expression continue on lines omitted from this excerpt. */
669 t1 = _mm_sub_ps(lij,uij);
670 t2 = _mm_mul_ps(diff2,
671 _mm_sub_ps(_mm_mul_ps(onefourth,r),
673 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
674 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
675 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
676 t4 = _mm_and_ps(t4,obc_mask3);
677 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
679 t1B = _mm_sub_ps(lijB,uijB);
680 t2B = _mm_mul_ps(diff2B,
681 _mm_sub_ps(_mm_mul_ps(onefourth,rB),
683 t3B = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
684 t1B = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
685 t4B = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lijB));
686 t4B = _mm_and_ps(t4B,obc_mask3B);
687 t1B = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
/* Accumulate both batches' aj->ai contributions, zeroed where mask1 off. */
689 sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1,obc_mask1), _mm_and_ps(t1B,obc_mask1B) ));
/* Derivative of the aj->ai integral w.r.t. r (stored later as dadx1). */
691 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
692 _mm_mul_ps(prod,lij3));
694 _mm_mul_ps(onefourth,
695 _mm_add_ps(_mm_mul_ps(lij,rinv),
696 _mm_mul_ps(lij3,r))));
697 t2 = _mm_mul_ps(onefourth,
698 _mm_add_ps(_mm_mul_ps(uij,rinv),
699 _mm_mul_ps(uij3,r)));
701 _mm_add_ps(_mm_mul_ps(half,uij2),
702 _mm_mul_ps(prod,uij3)));
703 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
704 _mm_mul_ps(rinv,rinv));
706 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
708 _mm_mul_ps(sk2_rinv,rinv))));
709 t1 = _mm_mul_ps(rinv,
710 _mm_add_ps(_mm_mul_ps(dlij,t1),
715 t1B = _mm_add_ps(_mm_mul_ps(half,lij2B),
716 _mm_mul_ps(prodB,lij3B));
717 t1B = _mm_sub_ps(t1B,
718 _mm_mul_ps(onefourth,
719 _mm_add_ps(_mm_mul_ps(lijB,rinvB),
720 _mm_mul_ps(lij3B,rB))));
721 t2B = _mm_mul_ps(onefourth,
722 _mm_add_ps(_mm_mul_ps(uijB,rinvB),
723 _mm_mul_ps(uij3B,rB)));
724 t2B = _mm_sub_ps(t2B,
725 _mm_add_ps(_mm_mul_ps(half,uij2B),
726 _mm_mul_ps(prodB,uij3B)));
727 t3B = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
728 _mm_mul_ps(rinvB,rinvB));
729 t3B = _mm_sub_ps(t3B,
730 _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
732 _mm_mul_ps(sk2_rinvB,rinvB))));
733 t1B = _mm_mul_ps(rinvB,
734 _mm_add_ps(_mm_mul_ps(dlijB,t1B),
735 _mm_add_ps(t2B,t3B)));
737 dadx1 = _mm_and_ps(t1,obc_mask1);
738 dadx1B = _mm_and_ps(t1B,obc_mask1B);
741 /* Evaluate influence of atom ai -> aj */
/* Same integral with the roles swapped: sk of atom i, radius of atom j. */
742 t1 = _mm_add_ps(r,sk_ai);
743 t2 = _mm_sub_ps(r,sk_ai);
744 t3 = _mm_sub_ps(sk_ai,r);
745 t1B = _mm_add_ps(rB,sk_ai);
746 t2B = _mm_sub_ps(rB,sk_ai);
747 t3B = _mm_sub_ps(sk_ai,rB);
748 obc_mask1 = _mm_cmplt_ps(raj, t1);
749 obc_mask2 = _mm_cmplt_ps(raj, t2);
750 obc_mask3 = _mm_cmplt_ps(raj, t3);
751 obc_mask1B = _mm_cmplt_ps(rajB, t1B);
752 obc_mask2B = _mm_cmplt_ps(rajB, t2B);
753 obc_mask3B = _mm_cmplt_ps(rajB, t3B);
755 uij = gmx_mm_inv_ps(t1);
756 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
757 _mm_andnot_ps(obc_mask2,raj_inv));
758 dlij = _mm_and_ps(one,obc_mask2);
759 uij2 = _mm_mul_ps(uij, uij);
760 uij3 = _mm_mul_ps(uij2,uij);
761 lij2 = _mm_mul_ps(lij, lij);
762 lij3 = _mm_mul_ps(lij2,lij);
764 uijB = gmx_mm_inv_ps(t1B);
765 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
766 _mm_andnot_ps(obc_mask2B,raj_invB));
767 dlijB = _mm_and_ps(one,obc_mask2B);
768 uij2B = _mm_mul_ps(uijB, uijB);
769 uij3B = _mm_mul_ps(uij2B,uijB);
770 lij2B = _mm_mul_ps(lijB, lijB);
771 lij3B = _mm_mul_ps(lij2B,lijB);
773 diff2 = _mm_sub_ps(uij2,lij2);
774 lij_inv = gmx_mm_invsqrt_ps(lij2);
775 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
776 prod = _mm_mul_ps(onefourth,sk2_rinv);
778 diff2B = _mm_sub_ps(uij2B,lij2B);
779 lij_invB = gmx_mm_invsqrt_ps(lij2B);
780 sk2_rinvB = _mm_mul_ps(sk2_ai,rinvB);
781 prodB = _mm_mul_ps(onefourth,sk2_rinvB);
783 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
784 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
786 t1 = _mm_sub_ps(lij,uij);
787 t2 = _mm_mul_ps(diff2,
788 _mm_sub_ps(_mm_mul_ps(onefourth,r),
790 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
791 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
792 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
793 t4 = _mm_and_ps(t4,obc_mask3);
794 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
796 t1B = _mm_sub_ps(lijB,uijB);
797 t2B = _mm_mul_ps(diff2B,
798 _mm_sub_ps(_mm_mul_ps(onefourth,rB),
800 t3B = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
801 t1B = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
802 t4B = _mm_mul_ps(two,_mm_sub_ps(raj_invB,lijB));
803 t4B = _mm_and_ps(t4B,obc_mask3B);
804 t1B = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
/* Scatter the ai->aj contributions into the j atoms' work slots. */
806 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
807 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_and_ps(t1B,obc_mask1B));
/* Derivative of the ai->aj integral (stored later as dadx2). */
809 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
810 _mm_mul_ps(prod,lij3));
812 _mm_mul_ps(onefourth,
813 _mm_add_ps(_mm_mul_ps(lij,rinv),
814 _mm_mul_ps(lij3,r))));
815 t2 = _mm_mul_ps(onefourth,
816 _mm_add_ps(_mm_mul_ps(uij,rinv),
817 _mm_mul_ps(uij3,r)));
819 _mm_add_ps(_mm_mul_ps(half,uij2),
820 _mm_mul_ps(prod,uij3)));
821 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
822 _mm_mul_ps(rinv,rinv));
824 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
826 _mm_mul_ps(sk2_rinv,rinv))));
827 t1 = _mm_mul_ps(rinv,
828 _mm_add_ps(_mm_mul_ps(dlij,t1),
832 t1B = _mm_add_ps(_mm_mul_ps(half,lij2B),
833 _mm_mul_ps(prodB,lij3B));
834 t1B = _mm_sub_ps(t1B,
835 _mm_mul_ps(onefourth,
836 _mm_add_ps(_mm_mul_ps(lijB,rinvB),
837 _mm_mul_ps(lij3B,rB))));
838 t2B = _mm_mul_ps(onefourth,
839 _mm_add_ps(_mm_mul_ps(uijB,rinvB),
840 _mm_mul_ps(uij3B,rB)));
841 t2B = _mm_sub_ps(t2B,
842 _mm_add_ps(_mm_mul_ps(half,uij2B),
843 _mm_mul_ps(prodB,uij3B)));
844 t3B = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
845 _mm_mul_ps(rinvB,rinvB));
846 t3B = _mm_sub_ps(t3B,
847 _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
849 _mm_mul_ps(sk2_rinvB,rinvB))));
850 t1B = _mm_mul_ps(rinvB,
851 _mm_add_ps(_mm_mul_ps(dlijB,t1B),
852 _mm_add_ps(t2B,t3B)));
855 dadx2 = _mm_and_ps(t1,obc_mask1);
856 dadx2B = _mm_and_ps(t1B,obc_mask1B);
/* Store the four derivative vectors; the dadx pointer advances between
 * stores on lines omitted from this excerpt. */
858 _mm_store_ps(dadx,dadx1);
860 _mm_store_ps(dadx,dadx2);
862 _mm_store_ps(dadx,dadx1B);
864 _mm_store_ps(dadx,dadx2B);
867 } /* end normal inner loop */
/* 4-wide epilogue loop — same math as above, single batch, but guarded
 * by movemask tests so the expensive log/inv work is skipped when no
 * lane passes the rai/raj < r+sk test. */
869 for(;k<nj1-offset;k+=4)
881 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
882 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
883 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
885 dx = _mm_sub_ps(ix, jx);
886 dy = _mm_sub_ps(iy, jy);
887 dz = _mm_sub_ps(iz, jz);
889 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
891 rinv = gmx_mm_invsqrt_ps(rsq);
892 r = _mm_mul_ps(rsq,rinv);
894 /* Compute raj_inv aj1-4 */
895 raj_inv = gmx_mm_inv_ps(raj);
897 /* Evaluate influence of atom aj -> ai */
898 t1 = _mm_add_ps(r,sk_aj);
899 obc_mask1 = _mm_cmplt_ps(rai, t1);
901 if(_mm_movemask_ps(obc_mask1))
903 /* If any of the elements has rai<dr+sk, this is executed */
904 t2 = _mm_sub_ps(r,sk_aj);
905 t3 = _mm_sub_ps(sk_aj,r);
907 obc_mask2 = _mm_cmplt_ps(rai, t2);
908 obc_mask3 = _mm_cmplt_ps(rai, t3);
910 uij = gmx_mm_inv_ps(t1);
911 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
912 _mm_andnot_ps(obc_mask2,rai_inv));
913 dlij = _mm_and_ps(one,obc_mask2);
914 uij2 = _mm_mul_ps(uij, uij);
915 uij3 = _mm_mul_ps(uij2,uij);
916 lij2 = _mm_mul_ps(lij, lij);
917 lij3 = _mm_mul_ps(lij2,lij);
918 diff2 = _mm_sub_ps(uij2,lij2);
919 lij_inv = gmx_mm_invsqrt_ps(lij2);
920 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
921 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
922 prod = _mm_mul_ps(onefourth,sk2_rinv);
923 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
924 t1 = _mm_sub_ps(lij,uij);
925 t2 = _mm_mul_ps(diff2,
926 _mm_sub_ps(_mm_mul_ps(onefourth,r),
928 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
929 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
930 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
931 t4 = _mm_and_ps(t4,obc_mask3);
932 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
933 sum_ai = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
934 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
935 _mm_mul_ps(prod,lij3));
937 _mm_mul_ps(onefourth,
938 _mm_add_ps(_mm_mul_ps(lij,rinv),
939 _mm_mul_ps(lij3,r))));
940 t2 = _mm_mul_ps(onefourth,
941 _mm_add_ps(_mm_mul_ps(uij,rinv),
942 _mm_mul_ps(uij3,r)));
944 _mm_add_ps(_mm_mul_ps(half,uij2),
945 _mm_mul_ps(prod,uij3)));
946 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
947 _mm_mul_ps(rinv,rinv));
949 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
951 _mm_mul_ps(sk2_rinv,rinv))));
952 t1 = _mm_mul_ps(rinv,
953 _mm_add_ps(_mm_mul_ps(dlij,t1),
956 dadx1 = _mm_and_ps(t1,obc_mask1);
/* else-branch (braces omitted): no lane contributes, zero derivative. */
960 dadx1 = _mm_setzero_ps();
963 /* Evaluate influence of atom ai -> aj */
964 t1 = _mm_add_ps(r,sk_ai);
965 obc_mask1 = _mm_cmplt_ps(raj, t1);
967 if(_mm_movemask_ps(obc_mask1))
969 t2 = _mm_sub_ps(r,sk_ai);
970 t3 = _mm_sub_ps(sk_ai,r);
971 obc_mask2 = _mm_cmplt_ps(raj, t2);
972 obc_mask3 = _mm_cmplt_ps(raj, t3);
974 uij = gmx_mm_inv_ps(t1);
975 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
976 _mm_andnot_ps(obc_mask2,raj_inv));
977 dlij = _mm_and_ps(one,obc_mask2);
978 uij2 = _mm_mul_ps(uij, uij);
979 uij3 = _mm_mul_ps(uij2,uij);
980 lij2 = _mm_mul_ps(lij, lij);
981 lij3 = _mm_mul_ps(lij2,lij);
982 diff2 = _mm_sub_ps(uij2,lij2);
983 lij_inv = gmx_mm_invsqrt_ps(lij2);
984 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
985 prod = _mm_mul_ps(onefourth,sk2_rinv);
986 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
987 t1 = _mm_sub_ps(lij,uij);
988 t2 = _mm_mul_ps(diff2,
989 _mm_sub_ps(_mm_mul_ps(onefourth,r),
991 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
992 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
993 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
994 t4 = _mm_and_ps(t4,obc_mask3);
995 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
997 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
999 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1000 _mm_mul_ps(prod,lij3));
1002 _mm_mul_ps(onefourth,
1003 _mm_add_ps(_mm_mul_ps(lij,rinv),
1004 _mm_mul_ps(lij3,r))));
1005 t2 = _mm_mul_ps(onefourth,
1006 _mm_add_ps(_mm_mul_ps(uij,rinv),
1007 _mm_mul_ps(uij3,r)));
1009 _mm_add_ps(_mm_mul_ps(half,uij2),
1010 _mm_mul_ps(prod,uij3)));
1011 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1012 _mm_mul_ps(rinv,rinv));
1014 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1016 _mm_mul_ps(sk2_rinv,rinv))));
1017 t1 = _mm_mul_ps(rinv,
1018 _mm_add_ps(_mm_mul_ps(dlij,t1),
1019 _mm_add_ps(t2,t3)));
1020 dadx2 = _mm_and_ps(t1,obc_mask1);
/* else-branch (braces omitted): zero derivative when no lane passes. */
1024 dadx2 = _mm_setzero_ps();
1027 _mm_store_ps(dadx,dadx1);
1029 _mm_store_ps(dadx,dadx2);
1031 } /* end normal inner loop */
/* Remainder handling: load 1, 2 or 3 leftover j atoms depending on
 * offset (switch/if scaffolding omitted from this excerpt). */
1039 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1040 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
1041 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA,sk_aj);
1050 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1051 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
1052 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA,obc_param+jnrB,sk_aj);
1057 /* offset must be 3 */
1064 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1065 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
1066 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,sk_aj);
1070 dx = _mm_sub_ps(ix, jx);
1071 dy = _mm_sub_ps(iy, jy);
1072 dz = _mm_sub_ps(iz, jz);
1074 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
1076 rinv = gmx_mm_invsqrt_ps(rsq);
1077 r = _mm_mul_ps(rsq,rinv);
1079 /* Compute raj_inv aj1-4 */
1080 raj_inv = gmx_mm_inv_ps(raj);
1082 /* Evaluate influence of atom aj -> ai */
1083 t1 = _mm_add_ps(r,sk_aj);
1084 obc_mask1 = _mm_cmplt_ps(rai, t1);
/* Restrict to the lanes that hold real (not padded) j atoms — `mask`
 * is presumably set to mask1/mask2/mask3 above according to offset
 * (assignment lines not visible in this excerpt; TODO confirm). */
1085 obc_mask1 = _mm_and_ps(obc_mask1,mask);
1087 if(_mm_movemask_ps(obc_mask1))
1089 t2 = _mm_sub_ps(r,sk_aj);
1090 t3 = _mm_sub_ps(sk_aj,r);
1091 obc_mask2 = _mm_cmplt_ps(rai, t2);
1092 obc_mask3 = _mm_cmplt_ps(rai, t3);
1094 uij = gmx_mm_inv_ps(t1);
1095 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1096 _mm_andnot_ps(obc_mask2,rai_inv));
1097 dlij = _mm_and_ps(one,obc_mask2);
1098 uij2 = _mm_mul_ps(uij, uij);
1099 uij3 = _mm_mul_ps(uij2,uij);
1100 lij2 = _mm_mul_ps(lij, lij);
1101 lij3 = _mm_mul_ps(lij2,lij);
1102 diff2 = _mm_sub_ps(uij2,lij2);
1103 lij_inv = gmx_mm_invsqrt_ps(lij2);
1104 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
1105 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
1106 prod = _mm_mul_ps(onefourth,sk2_rinv);
1107 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1108 t1 = _mm_sub_ps(lij,uij);
1109 t2 = _mm_mul_ps(diff2,
1110 _mm_sub_ps(_mm_mul_ps(onefourth,r),
1112 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1113 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1114 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
1115 t4 = _mm_and_ps(t4,obc_mask3);
1116 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1117 sum_ai = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
1118 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1119 _mm_mul_ps(prod,lij3));
1121 _mm_mul_ps(onefourth,
1122 _mm_add_ps(_mm_mul_ps(lij,rinv),
1123 _mm_mul_ps(lij3,r))));
1124 t2 = _mm_mul_ps(onefourth,
1125 _mm_add_ps(_mm_mul_ps(uij,rinv),
1126 _mm_mul_ps(uij3,r)));
1128 _mm_add_ps(_mm_mul_ps(half,uij2),
1129 _mm_mul_ps(prod,uij3)));
1130 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1131 _mm_mul_ps(rinv,rinv));
1133 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1135 _mm_mul_ps(sk2_rinv,rinv))));
1136 t1 = _mm_mul_ps(rinv,
1137 _mm_add_ps(_mm_mul_ps(dlij,t1),
1138 _mm_add_ps(t2,t3)));
1139 dadx1 = _mm_and_ps(t1,obc_mask1);
1143 dadx1 = _mm_setzero_ps();
1146 /* Evaluate influence of atom ai -> aj */
1147 t1 = _mm_add_ps(r,sk_ai);
1148 obc_mask1 = _mm_cmplt_ps(raj, t1);
1149 obc_mask1 = _mm_and_ps(obc_mask1,mask);
1151 if(_mm_movemask_ps(obc_mask1))
1153 t2 = _mm_sub_ps(r,sk_ai);
1154 t3 = _mm_sub_ps(sk_ai,r);
1155 obc_mask2 = _mm_cmplt_ps(raj, t2);
1156 obc_mask3 = _mm_cmplt_ps(raj, t3);
1158 uij = gmx_mm_inv_ps(t1);
1159 lij = _mm_or_ps(_mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1160 _mm_andnot_ps(obc_mask2,raj_inv));
1161 dlij = _mm_and_ps(one,obc_mask2);
1162 uij2 = _mm_mul_ps(uij, uij);
1163 uij3 = _mm_mul_ps(uij2,uij);
1164 lij2 = _mm_mul_ps(lij, lij);
1165 lij3 = _mm_mul_ps(lij2,lij);
1166 diff2 = _mm_sub_ps(uij2,lij2);
1167 lij_inv = gmx_mm_invsqrt_ps(lij2);
1168 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
1169 prod = _mm_mul_ps(onefourth,sk2_rinv);
1170 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1171 t1 = _mm_sub_ps(lij,uij);
1172 t2 = _mm_mul_ps(diff2,
1173 _mm_sub_ps(_mm_mul_ps(onefourth,r),
1175 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1176 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1177 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
1178 t4 = _mm_and_ps(t4,obc_mask3);
1179 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
/* j-side contribution held in tmp; scattered after this block. */
1181 tmp = _mm_and_ps(t1,obc_mask1);
1183 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1184 _mm_mul_ps(prod,lij3));
1186 _mm_mul_ps(onefourth,
1187 _mm_add_ps(_mm_mul_ps(lij,rinv),
1188 _mm_mul_ps(lij3,r))));
1189 t2 = _mm_mul_ps(onefourth,
1190 _mm_add_ps(_mm_mul_ps(uij,rinv),
1191 _mm_mul_ps(uij3,r)));
1193 _mm_add_ps(_mm_mul_ps(half,uij2),
1194 _mm_mul_ps(prod,uij3)));
1195 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1196 _mm_mul_ps(rinv,rinv));
1198 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1200 _mm_mul_ps(sk2_rinv,rinv))));
1201 t1 = _mm_mul_ps(rinv,
1202 _mm_add_ps(_mm_mul_ps(dlij,t1),
1203 _mm_add_ps(t2,t3)));
1204 dadx2 = _mm_and_ps(t1,obc_mask1);
1208 dadx2 = _mm_setzero_ps();
1209 tmp = _mm_setzero_ps();
1212 _mm_store_ps(dadx,dadx1);
1214 _mm_store_ps(dadx,dadx2);
/* Scatter the j-side remainder increments into 1/2/3 work slots. */
1219 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
1223 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
1227 /* offset must be 3 */
1228 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
/* Horizontal reduction of the 4-lane sum_ai into work[ii]. */
1232 GMX_MM_UPDATE_1POT_PS(sum_ai,work+ii);
1236 /* Parallel summations */
1239 gmx_sum(natoms, work, cr);
1241 else if(DOMAINDECOMP(cr))
1243 dd_atom_sum_real(cr->dd, work);
/* HCT: Born radius from 1/rr minus the accumulated descreening sum,
 * clamped from below by min_rad. */
1246 if(gb_algorithm==egbHCT)
1249 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1251 if(born->use[i] != 0)
1253 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1254 sum = 1.0/rr - work[i];
1255 min_rad = rr + doffset;
1258 born->bRad[i] = rad > min_rad ? rad : min_rad;
1259 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1263 /* Extra communication required for DD */
1264 if(DOMAINDECOMP(cr))
1266 dd_atom_spread_real(cr->dd, born->bRad);
1267 dd_atom_spread_real(cr->dd, fr->invsqrta);
/* OBC branch: radius from the tanh(alpha*s - beta*s^2 + gamma*s^3)
 * rescaling; drobc stores d(Rborn)/d(sum) for the chain-rule forces. */
1273 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1275 if(born->use[i] != 0)
1277 rr = top->atomtypes.gb_radius[md->typeA[i]];
1285 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1286 born->bRad[i] = rr_inv - tsum*rr_inv2;
1287 born->bRad[i] = 1.0 / born->bRad[i];
1289 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1291 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1292 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1295 /* Extra (local) communication required for DD */
1296 if(DOMAINDECOMP(cr))
1298 dd_atom_spread_real(cr->dd, born->bRad);
1299 dd_atom_spread_real(cr->dd, fr->invsqrta);
1300 dd_atom_spread_real(cr->dd, born->drobc);
/* Apply the generalized-Born chain rule with SSE2 single-precision intrinsics:
 * combine the precomputed Born-radius derivative stream (dadx, in neighbour-list
 * order) with the per-atom potential derivatives (dvda) and accumulate the
 * resulting Cartesian pair forces into f[] and the shift forces fshift[],
 * four j neighbours at a time.
 *
 *   nl        - GB neighbour list (i entries, shift indices, j ranges)
 *   dadx      - streamed dA/dx chain-rule terms, consumed sequentially below
 *   dvda      - dV/d(Born radius) per atom, rescaled into rb[] per algorithm
 *   x, f      - coordinates and force array (3 floats per atom)
 *   fshift    - shift-force accumulators, indexed by 3*nl->shift[i]
 *   shiftvec  - periodic shift vectors applied to the i-atom position
 *
 * NOTE(review): this excerpt is elided in places (loop braces, the
 * jnrA..jnrD / j3A..j3D gathers from nl->jjnr, dadx pointer advances and the
 * final return are not visible) - confirm against the full source.
 */
1311 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1312 float *x, float *f, float *fshift, float *shiftvec,
1313 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1315 int i,k,n,ii,jnr,ii3,is3,nj0,nj1,offset,n0,n1;
1316 int jnrA,jnrB,jnrC,jnrD;
1317 int j3A,j3B,j3C,j3D;
1318 int jnrE,jnrF,jnrG,jnrH;
1319 int j3E,j3F,j3G,j3H;
1322 float rbi,shX,shY,shZ;
/* SIMD scratch: i/j Born-radius terms and partial scalar forces */
1334 __m128 rbai,rbaj,rbajB, f_gb, f_gb_ai,f_gbB,f_gb_aiB;
1335 __m128 xmm1,xmm2,xmm3;
1337 const __m128 two = _mm_set1_ps(2.0f);
1343 /* Loop to get the proper form for the Born radius term, sse style */
/* rb[i] is the algorithm-specific rescaling of dvda[i]; each GB flavour
 * applies a different chain-rule prefactor to dV/d(Born radius). */
1349 if(gb_algorithm==egbSTILL)
1353 rbi = born->bRad[i];
/* Still: divide ONE_4PI_EPS0 back out - presumably dvda was accumulated
 * including that electrostatic prefactor; TODO confirm against the kernel. */
1354 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1357 else if(gb_algorithm==egbHCT)
1361 rbi = born->bRad[i];
1362 rb[i] = rbi * rbi * dvda[i];
1365 else if(gb_algorithm==egbOBC)
1369 rbi = born->bRad[i];
/* OBC additionally multiplies by drobc, the derivative of the tanh
 * radius rescaling computed during the radii pass. */
1370 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1374 jz = _mm_setzero_ps();
1376 n = j3A = j3B = j3C = j3D = 0;
/* Outer loop over neighbour-list i entries */
1378 for(i=0;i<nl->nri;i++)
/* Periodic shift for this list entry, added to the i-atom position */
1382 is3 = 3*nl->shift[i];
1383 shX = shiftvec[is3];
1384 shY = shiftvec[is3+1];
1385 shZ = shiftvec[is3+2];
1386 nj0 = nl->jindex[i];
1387 nj1 = nl->jindex[i+1];
/* Broadcast the (shifted) i-atom coordinates to all four SIMD lanes */
1389 ix = _mm_set1_ps(shX+x[ii3+0]);
1390 iy = _mm_set1_ps(shY+x[ii3+1]);
1391 iz = _mm_set1_ps(shZ+x[ii3+2]);
/* Number of j atoms left over after 4-way vectorisation */
1393 offset = (nj1-nj0)%4;
/* Broadcast rb of the i atom; zero the i-force accumulators */
1395 rbai = _mm_load1_ps(rb+ii);
1396 fix = _mm_setzero_ps();
1397 fiy = _mm_setzero_ps();
1398 fiz = _mm_setzero_ps();
/* Main loop: four j neighbours per iteration.
 * NOTE(review): the statements filling jnrA..jnrD and j3A..j3D from
 * nl->jjnr are elided in this excerpt. */
1401 for(k=nj0;k<nj1-offset;k+=4)
1413 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
1415 dx = _mm_sub_ps(ix,jx);
1416 dy = _mm_sub_ps(iy,jy);
1417 dz = _mm_sub_ps(iz,jz);
/* Gather rb for the four j atoms */
1419 GMX_MM_LOAD_4VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rb+jnrD,rbaj);
1421 /* load chain rule terms for j1-4 */
1422 f_gb = _mm_load_ps(dadx);
/* NOTE(review): dadx is read twice without a visible increment here;
 * in the full source dadx is advanced between these loads - confirm. */
1424 f_gb_ai = _mm_load_ps(dadx);
1427 /* calculate scalar force */
/* f_gb = dadx_ij*rb[i] + dadx_ji*rb[j], the symmetric chain-rule sum */
1428 f_gb = _mm_mul_ps(f_gb,rbai);
1429 f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1430 f_gb = _mm_add_ps(f_gb,f_gb_ai);
/* Project the scalar force onto the connecting vector */
1432 tx = _mm_mul_ps(f_gb,dx);
1433 ty = _mm_mul_ps(f_gb,dy);
1434 tz = _mm_mul_ps(f_gb,dz);
1436 fix = _mm_add_ps(fix,tx);
1437 fiy = _mm_add_ps(fiy,ty);
1438 fiz = _mm_add_ps(fiz,tz);
/* Newton's third law: subtract the same force from the four j atoms */
1440 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A,f+j3B,f+j3C,f+j3D,tx,ty,tz);
1443 /*deal with odd elements */
/* Epilogue: 1, 2 or 3 remaining j atoms; unused SIMD lanes are padded
 * by the partial-load macros.  NOTE(review): the if(offset==...) branch
 * headers are elided in this excerpt. */
1450 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1451 GMX_MM_LOAD_1VALUE_PS(rb+jnrA,rbaj);
1459 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1460 GMX_MM_LOAD_2VALUES_PS(rb+jnrA,rb+jnrB,rbaj);
1464 /* offset must be 3 */
1471 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1472 GMX_MM_LOAD_3VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rbaj);
1475 dx = _mm_sub_ps(ix,jx);
1476 dy = _mm_sub_ps(iy,jy);
1477 dz = _mm_sub_ps(iz,jz);
1479 /* load chain rule terms for j1-4 */
1480 f_gb = _mm_load_ps(dadx);
/* NOTE(review): as above, the dadx advance between loads is elided */
1482 f_gb_ai = _mm_load_ps(dadx);
1485 /* calculate scalar force */
1486 f_gb = _mm_mul_ps(f_gb,rbai);
1487 f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1488 f_gb = _mm_add_ps(f_gb,f_gb_ai);
1490 tx = _mm_mul_ps(f_gb,dx);
1491 ty = _mm_mul_ps(f_gb,dy);
1492 tz = _mm_mul_ps(f_gb,dz);
1494 fix = _mm_add_ps(fix,tx);
1495 fiy = _mm_add_ps(fiy,ty);
1496 fiz = _mm_add_ps(fiz,tz);
/* Scatter the epilogue j-forces, matching the 1/2/3-atom load above */
1500 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A,tx,ty,tz);
1504 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A,f+j3B,tx,ty,tz);
1508 /* offset must be 3 */
1509 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A,f+j3B,f+j3C,tx,ty,tz);
1513 /* fix/fiy/fiz now contain four partial force terms, that all should be
1514 * added to the i particle forces and shift forces.
1516 gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,f+ii3,fshift+is3);
1524 /* keep compiler happy */
/* ISO C requires a translation unit to contain at least one declaration;
 * this dummy keeps the file legal when the SSE2 section above is
 * preprocessed away (see the #if at the top of the file). */
1525 int genborn_sse_dummy;
1527 #endif /* SSE intrinsics available */