2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
56 #include "gmx_fatal.h"
57 #include "mtop_util.h"
68 /* Only compile this file if SSE intrinsics are available */
/* NOTE(review): the "0 &&" below unconditionally disables this entire
 * translation unit, regardless of GMX_X86_SSE2.  If this was a temporary
 * measure to take the SSE2 generalized-Born kernels out of the build, it
 * should either be reverted or the file removed; as written, all code
 * below is dead.  Confirm intent before changing. */
69 #if 0 && defined (GMX_X86_SSE2)
71 #include <gmx_sse2_single.h>
72 #include <emmintrin.h>
74 #include "genborn_sse2_single.h"
/*
 * calc_gb_rad_still_sse2_single() - SSE2 (4-wide, single precision) kernel
 * that accumulates the Still-model polarization sums for all pairs in the
 * neighborlist nl, then converts the per-atom sums into Born radii
 * (born->bRad[i]) and their inverse square roots (fr->invsqrta[i]).
 *
 * NOTE(review): this extract is missing intermediate source lines (the
 * embedded original line numbers are not contiguous), so the comments
 * below describe only what the visible statements demonstrably do; index
 * setup, brace structure and some epilogue code are not visible here.
 */
78 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
79 int natoms, gmx_localtop_t *top,
80 const t_atomtypes *atype, float *x, t_nblist *nl,
83 int i,k,n,ii,is3,ii3,nj0,nj1,offset;
84 int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
85 int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
/* A-suffixed registers serve lanes 1-4 of the 8-way unrolled loop,
 * B-suffixed registers lanes 5-8. */
106 __m128 rsq,rinv,rinv2,rinv4,rinv6;
107 __m128 rsqB,rinvB,rinv2B,rinv4B,rinv6B;
108 __m128 ratio,gpi,rai,raj,vai,vaj,rvdw;
109 __m128 ratioB,rajB,vajB,rvdwB;
110 __m128 ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
111 __m128 ccfB,dccfB,thetaB,cosqB,termB,sinqB,resB,prodB;
112 __m128 mask,icf4,icf6,mask_cmp;
113 __m128 icf4B,icf6B,mask_cmpB;
/* Lane masks used for the epilogue when 1, 2 or 3 j-atoms remain. */
115 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
116 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
117 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
119 const __m128 half = _mm_set1_ps(0.5f);
120 const __m128 three = _mm_set1_ps(3.0f);
121 const __m128 one = _mm_set1_ps(1.0f);
122 const __m128 two = _mm_set1_ps(2.0f);
123 const __m128 zero = _mm_set1_ps(0.0f);
124 const __m128 four = _mm_set1_ps(4.0f);
126 const __m128 still_p5inv = _mm_set1_ps(STILL_P5INV);
127 const __m128 still_pip5 = _mm_set1_ps(STILL_PIP5);
128 const __m128 still_p4 = _mm_set1_ps(STILL_P4);
/* Prefactor applied when converting the accumulated sum to a radius below. */
130 factor = 0.5 * ONE_4PI_EPS0;
132 gb_radius = born->gb_radius;
134 work = born->gpol_still_work;
136 shiftvec = fr->shift_vec[0];
/* Keep the compiler quiet / define the epilogue registers before first use. */
139 jnrA = jnrB = jnrC = jnrD = 0;
140 jx = _mm_setzero_ps();
141 jy = _mm_setzero_ps();
142 jz = _mm_setzero_ps();
/* Presumably clears work[] for all atoms - loop body not visible here. */
146 for(i=0;i<natoms;i++)
/* Outer loop: one iteration per neighborlist entry (i-atom). */
151 for(i=0;i<nl->nri;i++)
155 is3 = 3*nl->shift[i];
157 shY = shiftvec[is3+1];
158 shZ = shiftvec[is3+2];
160 nj1 = nl->jindex[i+1];
/* Shifted i-atom coordinates broadcast to all four SIMD lanes. */
162 ix = _mm_set1_ps(shX+x[ii3+0]);
163 iy = _mm_set1_ps(shY+x[ii3+1]);
164 iz = _mm_set1_ps(shZ+x[ii3+2]);
166 offset = (nj1-nj0)%4;
168 /* Polarization energy for atom ai */
169 gpi = _mm_setzero_ps();
171 rai = _mm_load1_ps(gb_radius+ii);
172 prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
/* Main inner loop, 8-way unrolled: two 4-wide vectors (A and B) of
 * j-atoms per iteration.  j-index setup is not visible in this extract. */
174 for(k=nj0;k<nj1-4-offset;k+=8)
194 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
195 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
197 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
198 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
199 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
200 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE,vsolv+jnrF,vsolv+jnrG,vsolv+jnrH,vajB);
202 dx = _mm_sub_ps(ix,jx);
203 dy = _mm_sub_ps(iy,jy);
204 dz = _mm_sub_ps(iz,jz);
205 dxB = _mm_sub_ps(ix,jxB);
206 dyB = _mm_sub_ps(iy,jyB);
207 dzB = _mm_sub_ps(iz,jzB);
/* r^-2, r^-4 and r^-6 built from one rsqrt per vector. */
209 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
210 rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
211 rinv = gmx_mm_invsqrt_ps(rsq);
212 rinvB = gmx_mm_invsqrt_ps(rsqB);
213 rinv2 = _mm_mul_ps(rinv,rinv);
214 rinv2B = _mm_mul_ps(rinvB,rinvB);
215 rinv4 = _mm_mul_ps(rinv2,rinv2);
216 rinv4B = _mm_mul_ps(rinv2B,rinv2B);
217 rinv6 = _mm_mul_ps(rinv4,rinv2);
218 rinv6B = _mm_mul_ps(rinv4B,rinv2B);
/* ratio = rsq / (rai+raj)^2, compared against the Still P5INV cutoff. */
220 rvdw = _mm_add_ps(rai,raj);
221 rvdwB = _mm_add_ps(rai,rajB);
222 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
223 ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB,rvdwB)));
225 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
226 mask_cmpB = _mm_cmple_ps(ratioB,still_p5inv);
228 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
229 if( 0 == _mm_movemask_ps(mask_cmp) )
231 /* if ratio>still_p5inv for ALL elements */
233 dccf = _mm_setzero_ps();
/* Close-contact factor ccf and its derivative part dccf (else branch;
 * the ccf = 1 assignment for the taken branch is not visible here). */
237 ratio = _mm_min_ps(ratio,still_p5inv);
238 theta = _mm_mul_ps(ratio,still_pip5);
239 gmx_mm_sincos_ps(theta,&sinq,&cosq);
240 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
241 ccf = _mm_mul_ps(term,term);
242 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
243 _mm_mul_ps(sinq,theta));
/* Same close-contact handling for the B half of the unrolled pairs. */
245 if( 0 == _mm_movemask_ps(mask_cmpB) )
247 /* if ratio>still_p5inv for ALL elements */
249 dccfB = _mm_setzero_ps();
253 ratioB = _mm_min_ps(ratioB,still_p5inv);
254 thetaB = _mm_mul_ps(ratioB,still_pip5);
255 gmx_mm_sincos_ps(thetaB,&sinqB,&cosqB);
256 termB = _mm_mul_ps(half,_mm_sub_ps(one,cosqB));
257 ccfB = _mm_mul_ps(termB,termB);
258 dccfB = _mm_mul_ps(_mm_mul_ps(two,termB),
259 _mm_mul_ps(sinqB,thetaB));
/* icf4/icf6: ccf/r^4 term and its derivative combination with r^6. */
262 prod = _mm_mul_ps(still_p4,vaj);
263 prodB = _mm_mul_ps(still_p4,vajB);
264 icf4 = _mm_mul_ps(ccf,rinv4);
265 icf4B = _mm_mul_ps(ccfB,rinv4B);
266 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
267 icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccfB),dccfB), rinv6B);
/* Scatter the ai->aj contribution into work[] and accumulate aj->ai in gpi. */
269 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
270 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_mul_ps(prod_ai,icf4B));
272 gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod,icf4) , _mm_mul_ps(prodB,icf4B) ) );
/* Store derivative chunks; the dadx pointer increments between these
 * stores are among the lines missing from this extract. */
274 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
276 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
278 _mm_store_ps(dadx,_mm_mul_ps(prodB,icf6B));
280 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6B));
/* Secondary inner loop: plain 4-way vectors until only `offset` j-atoms
 * remain.  Same math as above, single vector. */
284 for(;k<nj1-offset;k+=4)
296 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
298 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
299 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
301 dx = _mm_sub_ps(ix,jx);
302 dy = _mm_sub_ps(iy,jy);
303 dz = _mm_sub_ps(iz,jz);
305 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
306 rinv = gmx_mm_invsqrt_ps(rsq);
307 rinv2 = _mm_mul_ps(rinv,rinv);
308 rinv4 = _mm_mul_ps(rinv2,rinv2);
309 rinv6 = _mm_mul_ps(rinv4,rinv2);
311 rvdw = _mm_add_ps(rai,raj);
312 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
314 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
316 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
317 if(0 == _mm_movemask_ps(mask_cmp))
319 /* if ratio>still_p5inv for ALL elements */
321 dccf = _mm_setzero_ps();
325 ratio = _mm_min_ps(ratio,still_p5inv);
326 theta = _mm_mul_ps(ratio,still_pip5);
327 gmx_mm_sincos_ps(theta,&sinq,&cosq);
328 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
329 ccf = _mm_mul_ps(term,term);
330 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
331 _mm_mul_ps(sinq,theta));
334 prod = _mm_mul_ps(still_p4,vaj);
335 icf4 = _mm_mul_ps(ccf,rinv4);
336 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
338 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
340 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
342 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
344 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
/* Epilogue: masked loads for the last offset==1 / 2 / 3 j-atoms (the
 * switch/if structure selecting between these cases is not visible). */
354 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
355 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
356 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA,vaj);
365 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
366 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
367 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA,vsolv+jnrB,vaj);
372 /* offset must be 3 */
379 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
380 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
381 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vaj);
385 dx = _mm_sub_ps(ix,jx);
386 dy = _mm_sub_ps(iy,jy);
387 dz = _mm_sub_ps(iz,jz);
389 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
390 rinv = gmx_mm_invsqrt_ps(rsq);
391 rinv2 = _mm_mul_ps(rinv,rinv);
392 rinv4 = _mm_mul_ps(rinv2,rinv2);
393 rinv6 = _mm_mul_ps(rinv4,rinv2);
395 rvdw = _mm_add_ps(rai,raj);
396 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
398 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
400 if(0 == _mm_movemask_ps(mask_cmp))
402 /* if ratio>still_p5inv for ALL elements */
404 dccf = _mm_setzero_ps();
408 ratio = _mm_min_ps(ratio,still_p5inv);
409 theta = _mm_mul_ps(ratio,still_pip5);
410 gmx_mm_sincos_ps(theta,&sinq,&cosq);
411 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
412 ccf = _mm_mul_ps(term,term);
413 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
414 _mm_mul_ps(sinq,theta));
417 prod = _mm_mul_ps(still_p4,vaj);
418 icf4 = _mm_mul_ps(ccf,rinv4);
419 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
421 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
423 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
425 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
/* Scatter the epilogue ai->aj contribution into work[] with the matching
 * 1/2/3-value increment macro. */
428 tmp = _mm_mul_ps(prod_ai,icf4);
432 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
436 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
440 /* offset must be 3 */
441 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
/* Horizontal reduction of the 4-wide gpi accumulator into work[ii]. */
444 GMX_MM_UPDATE_1POT_PS(gpi,work+ii);
447 /* Sum up the polarization energy from other nodes */
450 gmx_sum(natoms, work, cr);
452 else if(DOMAINDECOMP(cr))
454 dd_atom_sum_real(cr->dd, work);
457 /* Compute the radii */
458 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
460 if(born->use[i] != 0)
462 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
463 gpi2 = gpi_ai * gpi_ai;
464 born->bRad[i] = factor*gmx_invsqrt(gpi2);
465 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
469 /* Extra (local) communication required for DD */
472 dd_atom_spread_real(cr->dd, born->bRad);
473 dd_atom_spread_real(cr->dd, fr->invsqrta);
/*
 * calc_gb_rad_hct_obc_sse2_single() - SSE2 (4-wide, single precision)
 * kernel for the HCT and OBC generalized-Born models.  For every pair in
 * the neighborlist it evaluates the descreening integral in BOTH
 * directions (aj->ai accumulated in sum_ai, ai->aj scattered into work[]),
 * stores per-pair derivative terms through dadx, and finally converts the
 * accumulated sums into Born radii according to gb_algorithm
 * (egbHCT or the OBC tanh formula).
 *
 * NOTE(review): this extract is missing intermediate source lines (the
 * embedded original line numbers are not contiguous), so the comments
 * below describe only what the visible statements demonstrably do; index
 * setup, some subtractions (t1/t2/t3 "_mm_sub_ps" halves), dadx pointer
 * increments and the brace structure are not all visible here.
 */
481 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
482 const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
484 int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
485 int jnrA,jnrB,jnrC,jnrD;
487 int jnrE,jnrF,jnrG,jnrH;
490 float rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
491 float sum_ai2, sum_ai3,tsum,tchain,doffset;
500 __m128 ix,iy,iz,jx,jy,jz;
501 __m128 dx,dy,dz,t1,t2,t3,t4;
503 __m128 rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
504 __m128 uij,lij2,uij2,lij3,uij3,diff2;
505 __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
506 __m128 sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
510 __m128 obc_mask1,obc_mask2,obc_mask3;
/* B-suffixed registers serve lanes 5-8 of the 8-way unrolled main loop. */
511 __m128 jxB,jyB,jzB,t1B,t2B,t3B,t4B;
512 __m128 dxB,dyB,dzB,rsqB,rinvB,rB;
513 __m128 rajB, raj_invB,rai_inv2B,sk2B,lijB,dlijB,duijB;
514 __m128 uijB,lij2B,uij2B,lij3B,uij3B,diff2B;
515 __m128 lij_invB,sk2_invB,prodB;
516 __m128 sk_ajB,sk2_ajB,sk2_rinvB;
517 __m128 dadx1B,dadx2B;
519 __m128 obc_mask1B,obc_mask2B,obc_mask3B;
/* Lane masks used for the epilogue when 1, 2 or 3 j-atoms remain. */
521 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
522 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
523 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
525 oneeighth = _mm_set1_ps(0.125);
526 onefourth = _mm_set1_ps(0.25);
528 const __m128 half = _mm_set1_ps(0.5f);
529 const __m128 three = _mm_set1_ps(3.0f);
530 const __m128 one = _mm_set1_ps(1.0f);
531 const __m128 two = _mm_set1_ps(2.0f);
532 const __m128 zero = _mm_set1_ps(0.0f);
533 const __m128 neg = _mm_set1_ps(-1.0f);
535 /* Set the dielectric offset */
536 doffset = born->gb_doffset;
537 gb_radius = born->gb_radius;
538 obc_param = born->param;
539 work = born->gpol_hct_work;
542 shiftvec = fr->shift_vec[0];
/* Define the epilogue registers / indices before first use. */
544 jx = _mm_setzero_ps();
545 jy = _mm_setzero_ps();
546 jz = _mm_setzero_ps();
548 jnrA = jnrB = jnrC = jnrD = 0;
/* Presumably clears work[] for all atoms - loop body not visible here. */
550 for(i=0;i<born->nr;i++)
/* Outer loop: one iteration per neighborlist entry (i-atom). */
555 for(i=0;i<nl->nri;i++)
559 is3 = 3*nl->shift[i];
561 shY = shiftvec[is3+1];
562 shZ = shiftvec[is3+2];
564 nj1 = nl->jindex[i+1];
566 ix = _mm_set1_ps(shX+x[ii3+0]);
567 iy = _mm_set1_ps(shY+x[ii3+1]);
568 iz = _mm_set1_ps(shZ+x[ii3+2]);
570 offset = (nj1-nj0)%4;
/* rai is the (offset-reduced) i-atom GB radius; sk_ai its HCT scale. */
572 rai = _mm_load1_ps(gb_radius+ii);
573 rai_inv= gmx_mm_inv_ps(rai);
575 sum_ai = _mm_setzero_ps();
577 sk_ai = _mm_load1_ps(born->param+ii);
578 sk2_ai = _mm_mul_ps(sk_ai,sk_ai);
/* Main inner loop, 8-way unrolled: two 4-wide vectors (A and B) of
 * j-atoms per iteration.  j-index setup is not visible in this extract. */
580 for(k=nj0;k<nj1-4-offset;k+=8)
600 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
601 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
602 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
603 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
604 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
605 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE,obc_param+jnrF,obc_param+jnrG,obc_param+jnrH,sk_ajB);
607 dx = _mm_sub_ps(ix, jx);
608 dy = _mm_sub_ps(iy, jy);
609 dz = _mm_sub_ps(iz, jz);
610 dxB = _mm_sub_ps(ix, jxB);
611 dyB = _mm_sub_ps(iy, jyB);
612 dzB = _mm_sub_ps(iz, jzB);
614 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
615 rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
/* r = rsq * 1/sqrt(rsq): one rsqrt yields both r and 1/r. */
617 rinv = gmx_mm_invsqrt_ps(rsq);
618 r = _mm_mul_ps(rsq,rinv);
619 rinvB = gmx_mm_invsqrt_ps(rsqB);
620 rB = _mm_mul_ps(rsqB,rinvB);
622 /* Compute raj_inv aj1-4 */
623 raj_inv = gmx_mm_inv_ps(raj);
624 raj_invB = gmx_mm_inv_ps(rajB);
626 /* Evaluate influence of atom aj -> ai */
/* obc_mask1: pair contributes at all; obc_mask2: ai outside descreening
 * sphere (selects 1/t2 over rai_inv for lij); obc_mask3: ai fully inside. */
627 t1 = _mm_add_ps(r,sk_aj);
628 t2 = _mm_sub_ps(r,sk_aj);
629 t3 = _mm_sub_ps(sk_aj,r);
630 t1B = _mm_add_ps(rB,sk_ajB);
631 t2B = _mm_sub_ps(rB,sk_ajB);
632 t3B = _mm_sub_ps(sk_ajB,rB);
633 obc_mask1 = _mm_cmplt_ps(rai, t1);
634 obc_mask2 = _mm_cmplt_ps(rai, t2);
635 obc_mask3 = _mm_cmplt_ps(rai, t3);
636 obc_mask1B = _mm_cmplt_ps(rai, t1B);
637 obc_mask2B = _mm_cmplt_ps(rai, t2B);
638 obc_mask3B = _mm_cmplt_ps(rai, t3B);
640 uij = gmx_mm_inv_ps(t1);
641 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
642 _mm_andnot_ps(obc_mask2,rai_inv));
643 dlij = _mm_and_ps(one,obc_mask2);
644 uij2 = _mm_mul_ps(uij, uij);
645 uij3 = _mm_mul_ps(uij2,uij);
646 lij2 = _mm_mul_ps(lij, lij);
647 lij3 = _mm_mul_ps(lij2,lij);
649 uijB = gmx_mm_inv_ps(t1B);
650 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
651 _mm_andnot_ps(obc_mask2B,rai_inv));
652 dlijB = _mm_and_ps(one,obc_mask2B);
653 uij2B = _mm_mul_ps(uijB, uijB);
654 uij3B = _mm_mul_ps(uij2B,uijB);
655 lij2B = _mm_mul_ps(lijB, lijB);
656 lij3B = _mm_mul_ps(lij2B,lijB);
658 diff2 = _mm_sub_ps(uij2,lij2);
659 lij_inv = gmx_mm_invsqrt_ps(lij2);
660 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
661 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
662 prod = _mm_mul_ps(onefourth,sk2_rinv);
664 diff2B = _mm_sub_ps(uij2B,lij2B);
665 lij_invB = gmx_mm_invsqrt_ps(lij2B);
666 sk2_ajB = _mm_mul_ps(sk_ajB,sk_ajB);
667 sk2_rinvB = _mm_mul_ps(sk2_ajB,rinvB);
668 prodB = _mm_mul_ps(onefourth,sk2_rinvB);
670 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
671 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
/* t1 accumulates the HCT integral; t4 is the extra term applied when the
 * i-atom lies fully inside the descreening sphere (obc_mask3). */
673 t1 = _mm_sub_ps(lij,uij);
674 t2 = _mm_mul_ps(diff2,
675 _mm_sub_ps(_mm_mul_ps(onefourth,r),
677 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
678 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
679 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
680 t4 = _mm_and_ps(t4,obc_mask3);
681 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
683 t1B = _mm_sub_ps(lijB,uijB);
684 t2B = _mm_mul_ps(diff2B,
685 _mm_sub_ps(_mm_mul_ps(onefourth,rB),
687 t3B = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
688 t1B = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
689 t4B = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lijB));
690 t4B = _mm_and_ps(t4B,obc_mask3B);
691 t1B = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
693 sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1,obc_mask1), _mm_and_ps(t1B,obc_mask1B) ));
/* Derivative of the aj->ai integral w.r.t. r, kept for the chain rule. */
695 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
696 _mm_mul_ps(prod,lij3));
698 _mm_mul_ps(onefourth,
699 _mm_add_ps(_mm_mul_ps(lij,rinv),
700 _mm_mul_ps(lij3,r))));
701 t2 = _mm_mul_ps(onefourth,
702 _mm_add_ps(_mm_mul_ps(uij,rinv),
703 _mm_mul_ps(uij3,r)));
705 _mm_add_ps(_mm_mul_ps(half,uij2),
706 _mm_mul_ps(prod,uij3)));
707 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
708 _mm_mul_ps(rinv,rinv));
710 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
712 _mm_mul_ps(sk2_rinv,rinv))));
713 t1 = _mm_mul_ps(rinv,
714 _mm_add_ps(_mm_mul_ps(dlij,t1),
719 t1B = _mm_add_ps(_mm_mul_ps(half,lij2B),
720 _mm_mul_ps(prodB,lij3B));
721 t1B = _mm_sub_ps(t1B,
722 _mm_mul_ps(onefourth,
723 _mm_add_ps(_mm_mul_ps(lijB,rinvB),
724 _mm_mul_ps(lij3B,rB))));
725 t2B = _mm_mul_ps(onefourth,
726 _mm_add_ps(_mm_mul_ps(uijB,rinvB),
727 _mm_mul_ps(uij3B,rB)));
728 t2B = _mm_sub_ps(t2B,
729 _mm_add_ps(_mm_mul_ps(half,uij2B),
730 _mm_mul_ps(prodB,uij3B)));
731 t3B = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
732 _mm_mul_ps(rinvB,rinvB));
733 t3B = _mm_sub_ps(t3B,
734 _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
736 _mm_mul_ps(sk2_rinvB,rinvB))));
737 t1B = _mm_mul_ps(rinvB,
738 _mm_add_ps(_mm_mul_ps(dlijB,t1B),
739 _mm_add_ps(t2B,t3B)));
741 dadx1 = _mm_and_ps(t1,obc_mask1);
742 dadx1B = _mm_and_ps(t1B,obc_mask1B);
745 /* Evaluate influence of atom ai -> aj */
/* Symmetric pass with roles swapped: sk_ai descreens atom aj. */
746 t1 = _mm_add_ps(r,sk_ai);
747 t2 = _mm_sub_ps(r,sk_ai);
748 t3 = _mm_sub_ps(sk_ai,r);
749 t1B = _mm_add_ps(rB,sk_ai);
750 t2B = _mm_sub_ps(rB,sk_ai);
751 t3B = _mm_sub_ps(sk_ai,rB);
752 obc_mask1 = _mm_cmplt_ps(raj, t1);
753 obc_mask2 = _mm_cmplt_ps(raj, t2);
754 obc_mask3 = _mm_cmplt_ps(raj, t3);
755 obc_mask1B = _mm_cmplt_ps(rajB, t1B);
756 obc_mask2B = _mm_cmplt_ps(rajB, t2B);
757 obc_mask3B = _mm_cmplt_ps(rajB, t3B);
759 uij = gmx_mm_inv_ps(t1);
760 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
761 _mm_andnot_ps(obc_mask2,raj_inv));
762 dlij = _mm_and_ps(one,obc_mask2);
763 uij2 = _mm_mul_ps(uij, uij);
764 uij3 = _mm_mul_ps(uij2,uij);
765 lij2 = _mm_mul_ps(lij, lij);
766 lij3 = _mm_mul_ps(lij2,lij);
768 uijB = gmx_mm_inv_ps(t1B);
769 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
770 _mm_andnot_ps(obc_mask2B,raj_invB));
771 dlijB = _mm_and_ps(one,obc_mask2B);
772 uij2B = _mm_mul_ps(uijB, uijB);
773 uij3B = _mm_mul_ps(uij2B,uijB);
774 lij2B = _mm_mul_ps(lijB, lijB);
775 lij3B = _mm_mul_ps(lij2B,lijB);
777 diff2 = _mm_sub_ps(uij2,lij2);
778 lij_inv = gmx_mm_invsqrt_ps(lij2);
779 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
780 prod = _mm_mul_ps(onefourth,sk2_rinv);
782 diff2B = _mm_sub_ps(uij2B,lij2B);
783 lij_invB = gmx_mm_invsqrt_ps(lij2B);
784 sk2_rinvB = _mm_mul_ps(sk2_ai,rinvB);
785 prodB = _mm_mul_ps(onefourth,sk2_rinvB);
787 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
788 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
790 t1 = _mm_sub_ps(lij,uij);
791 t2 = _mm_mul_ps(diff2,
792 _mm_sub_ps(_mm_mul_ps(onefourth,r),
794 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
795 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
796 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
797 t4 = _mm_and_ps(t4,obc_mask3);
798 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
800 t1B = _mm_sub_ps(lijB,uijB);
801 t2B = _mm_mul_ps(diff2B,
802 _mm_sub_ps(_mm_mul_ps(onefourth,rB),
804 t3B = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
805 t1B = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
806 t4B = _mm_mul_ps(two,_mm_sub_ps(raj_invB,lijB));
807 t4B = _mm_and_ps(t4B,obc_mask3B);
808 t1B = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
/* The ai->aj contribution is scattered into work[] for the j atoms. */
810 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
811 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_and_ps(t1B,obc_mask1B));
813 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
814 _mm_mul_ps(prod,lij3));
816 _mm_mul_ps(onefourth,
817 _mm_add_ps(_mm_mul_ps(lij,rinv),
818 _mm_mul_ps(lij3,r))));
819 t2 = _mm_mul_ps(onefourth,
820 _mm_add_ps(_mm_mul_ps(uij,rinv),
821 _mm_mul_ps(uij3,r)));
823 _mm_add_ps(_mm_mul_ps(half,uij2),
824 _mm_mul_ps(prod,uij3)));
825 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
826 _mm_mul_ps(rinv,rinv));
828 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
830 _mm_mul_ps(sk2_rinv,rinv))));
831 t1 = _mm_mul_ps(rinv,
832 _mm_add_ps(_mm_mul_ps(dlij,t1),
836 t1B = _mm_add_ps(_mm_mul_ps(half,lij2B),
837 _mm_mul_ps(prodB,lij3B));
838 t1B = _mm_sub_ps(t1B,
839 _mm_mul_ps(onefourth,
840 _mm_add_ps(_mm_mul_ps(lijB,rinvB),
841 _mm_mul_ps(lij3B,rB))));
842 t2B = _mm_mul_ps(onefourth,
843 _mm_add_ps(_mm_mul_ps(uijB,rinvB),
844 _mm_mul_ps(uij3B,rB)));
845 t2B = _mm_sub_ps(t2B,
846 _mm_add_ps(_mm_mul_ps(half,uij2B),
847 _mm_mul_ps(prodB,uij3B)));
848 t3B = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
849 _mm_mul_ps(rinvB,rinvB));
850 t3B = _mm_sub_ps(t3B,
851 _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
853 _mm_mul_ps(sk2_rinvB,rinvB))));
854 t1B = _mm_mul_ps(rinvB,
855 _mm_add_ps(_mm_mul_ps(dlijB,t1B),
856 _mm_add_ps(t2B,t3B)));
859 dadx2 = _mm_and_ps(t1,obc_mask1);
860 dadx2B = _mm_and_ps(t1B,obc_mask1B);
/* Store the four derivative vectors; the dadx pointer increments between
 * these stores are among the lines missing from this extract. */
862 _mm_store_ps(dadx,dadx1);
864 _mm_store_ps(dadx,dadx2);
866 _mm_store_ps(dadx,dadx1B);
868 _mm_store_ps(dadx,dadx2B);
871 } /* end normal inner loop */
/* Secondary inner loop: plain 4-way vectors until only `offset` j-atoms
 * remain.  Same math as above, with a movemask cheap-out per direction. */
873 for(;k<nj1-offset;k+=4)
885 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
886 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
887 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
889 dx = _mm_sub_ps(ix, jx);
890 dy = _mm_sub_ps(iy, jy);
891 dz = _mm_sub_ps(iz, jz);
893 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
895 rinv = gmx_mm_invsqrt_ps(rsq);
896 r = _mm_mul_ps(rsq,rinv);
898 /* Compute raj_inv aj1-4 */
899 raj_inv = gmx_mm_inv_ps(raj);
901 /* Evaluate influence of atom aj -> ai */
902 t1 = _mm_add_ps(r,sk_aj);
903 obc_mask1 = _mm_cmplt_ps(rai, t1);
905 if(_mm_movemask_ps(obc_mask1))
907 /* If any of the elements has rai<dr+sk, this is executed */
908 t2 = _mm_sub_ps(r,sk_aj);
909 t3 = _mm_sub_ps(sk_aj,r);
911 obc_mask2 = _mm_cmplt_ps(rai, t2);
912 obc_mask3 = _mm_cmplt_ps(rai, t3);
914 uij = gmx_mm_inv_ps(t1);
915 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
916 _mm_andnot_ps(obc_mask2,rai_inv));
917 dlij = _mm_and_ps(one,obc_mask2);
918 uij2 = _mm_mul_ps(uij, uij);
919 uij3 = _mm_mul_ps(uij2,uij);
920 lij2 = _mm_mul_ps(lij, lij);
921 lij3 = _mm_mul_ps(lij2,lij);
922 diff2 = _mm_sub_ps(uij2,lij2);
923 lij_inv = gmx_mm_invsqrt_ps(lij2);
924 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
925 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
926 prod = _mm_mul_ps(onefourth,sk2_rinv);
927 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
928 t1 = _mm_sub_ps(lij,uij);
929 t2 = _mm_mul_ps(diff2,
930 _mm_sub_ps(_mm_mul_ps(onefourth,r),
932 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
933 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
934 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
935 t4 = _mm_and_ps(t4,obc_mask3);
936 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
937 sum_ai = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
938 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
939 _mm_mul_ps(prod,lij3));
941 _mm_mul_ps(onefourth,
942 _mm_add_ps(_mm_mul_ps(lij,rinv),
943 _mm_mul_ps(lij3,r))));
944 t2 = _mm_mul_ps(onefourth,
945 _mm_add_ps(_mm_mul_ps(uij,rinv),
946 _mm_mul_ps(uij3,r)));
948 _mm_add_ps(_mm_mul_ps(half,uij2),
949 _mm_mul_ps(prod,uij3)));
950 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
951 _mm_mul_ps(rinv,rinv));
953 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
955 _mm_mul_ps(sk2_rinv,rinv))));
956 t1 = _mm_mul_ps(rinv,
957 _mm_add_ps(_mm_mul_ps(dlij,t1),
960 dadx1 = _mm_and_ps(t1,obc_mask1);
/* else branch: no lane contributes, store zero derivative. */
964 dadx1 = _mm_setzero_ps();
967 /* Evaluate influence of atom ai -> aj */
968 t1 = _mm_add_ps(r,sk_ai);
969 obc_mask1 = _mm_cmplt_ps(raj, t1);
971 if(_mm_movemask_ps(obc_mask1))
973 t2 = _mm_sub_ps(r,sk_ai);
974 t3 = _mm_sub_ps(sk_ai,r);
975 obc_mask2 = _mm_cmplt_ps(raj, t2);
976 obc_mask3 = _mm_cmplt_ps(raj, t3);
978 uij = gmx_mm_inv_ps(t1);
979 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
980 _mm_andnot_ps(obc_mask2,raj_inv));
981 dlij = _mm_and_ps(one,obc_mask2);
982 uij2 = _mm_mul_ps(uij, uij);
983 uij3 = _mm_mul_ps(uij2,uij);
984 lij2 = _mm_mul_ps(lij, lij);
985 lij3 = _mm_mul_ps(lij2,lij);
986 diff2 = _mm_sub_ps(uij2,lij2);
987 lij_inv = gmx_mm_invsqrt_ps(lij2);
988 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
989 prod = _mm_mul_ps(onefourth,sk2_rinv);
990 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
991 t1 = _mm_sub_ps(lij,uij);
992 t2 = _mm_mul_ps(diff2,
993 _mm_sub_ps(_mm_mul_ps(onefourth,r),
995 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
996 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
997 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
998 t4 = _mm_and_ps(t4,obc_mask3);
999 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1001 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
1003 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1004 _mm_mul_ps(prod,lij3));
1006 _mm_mul_ps(onefourth,
1007 _mm_add_ps(_mm_mul_ps(lij,rinv),
1008 _mm_mul_ps(lij3,r))));
1009 t2 = _mm_mul_ps(onefourth,
1010 _mm_add_ps(_mm_mul_ps(uij,rinv),
1011 _mm_mul_ps(uij3,r)));
1013 _mm_add_ps(_mm_mul_ps(half,uij2),
1014 _mm_mul_ps(prod,uij3)));
1015 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1016 _mm_mul_ps(rinv,rinv));
1018 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1020 _mm_mul_ps(sk2_rinv,rinv))));
1021 t1 = _mm_mul_ps(rinv,
1022 _mm_add_ps(_mm_mul_ps(dlij,t1),
1023 _mm_add_ps(t2,t3)));
1024 dadx2 = _mm_and_ps(t1,obc_mask1);
1028 dadx2 = _mm_setzero_ps();
1031 _mm_store_ps(dadx,dadx1);
1033 _mm_store_ps(dadx,dadx2);
1035 } /* end normal inner loop */
/* Epilogue: masked loads for the last offset==1 / 2 / 3 j-atoms; `mask`
 * (selected elsewhere from mask1/mask2/mask3) zeroes the unused lanes. */
1043 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1044 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
1045 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA,sk_aj);
1054 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1055 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
1056 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA,obc_param+jnrB,sk_aj);
1061 /* offset must be 3 */
1068 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1069 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
1070 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,sk_aj);
1074 dx = _mm_sub_ps(ix, jx);
1075 dy = _mm_sub_ps(iy, jy);
1076 dz = _mm_sub_ps(iz, jz);
1078 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
1080 rinv = gmx_mm_invsqrt_ps(rsq);
1081 r = _mm_mul_ps(rsq,rinv);
1083 /* Compute raj_inv aj1-4 */
1084 raj_inv = gmx_mm_inv_ps(raj);
1086 /* Evaluate influence of atom aj -> ai */
1087 t1 = _mm_add_ps(r,sk_aj);
1088 obc_mask1 = _mm_cmplt_ps(rai, t1);
1089 obc_mask1 = _mm_and_ps(obc_mask1,mask);
1091 if(_mm_movemask_ps(obc_mask1))
1093 t2 = _mm_sub_ps(r,sk_aj);
1094 t3 = _mm_sub_ps(sk_aj,r);
1095 obc_mask2 = _mm_cmplt_ps(rai, t2);
1096 obc_mask3 = _mm_cmplt_ps(rai, t3);
1098 uij = gmx_mm_inv_ps(t1);
1099 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1100 _mm_andnot_ps(obc_mask2,rai_inv));
1101 dlij = _mm_and_ps(one,obc_mask2);
1102 uij2 = _mm_mul_ps(uij, uij);
1103 uij3 = _mm_mul_ps(uij2,uij);
1104 lij2 = _mm_mul_ps(lij, lij);
1105 lij3 = _mm_mul_ps(lij2,lij);
1106 diff2 = _mm_sub_ps(uij2,lij2);
1107 lij_inv = gmx_mm_invsqrt_ps(lij2);
1108 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
1109 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
1110 prod = _mm_mul_ps(onefourth,sk2_rinv);
1111 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1112 t1 = _mm_sub_ps(lij,uij);
1113 t2 = _mm_mul_ps(diff2,
1114 _mm_sub_ps(_mm_mul_ps(onefourth,r),
1116 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1117 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1118 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
1119 t4 = _mm_and_ps(t4,obc_mask3);
1120 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1121 sum_ai = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
1122 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1123 _mm_mul_ps(prod,lij3));
1125 _mm_mul_ps(onefourth,
1126 _mm_add_ps(_mm_mul_ps(lij,rinv),
1127 _mm_mul_ps(lij3,r))));
1128 t2 = _mm_mul_ps(onefourth,
1129 _mm_add_ps(_mm_mul_ps(uij,rinv),
1130 _mm_mul_ps(uij3,r)));
1132 _mm_add_ps(_mm_mul_ps(half,uij2),
1133 _mm_mul_ps(prod,uij3)));
1134 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1135 _mm_mul_ps(rinv,rinv));
1137 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1139 _mm_mul_ps(sk2_rinv,rinv))));
1140 t1 = _mm_mul_ps(rinv,
1141 _mm_add_ps(_mm_mul_ps(dlij,t1),
1142 _mm_add_ps(t2,t3)));
1143 dadx1 = _mm_and_ps(t1,obc_mask1);
1147 dadx1 = _mm_setzero_ps();
1150 /* Evaluate influence of atom ai -> aj */
1151 t1 = _mm_add_ps(r,sk_ai);
1152 obc_mask1 = _mm_cmplt_ps(raj, t1);
1153 obc_mask1 = _mm_and_ps(obc_mask1,mask);
1155 if(_mm_movemask_ps(obc_mask1))
1157 t2 = _mm_sub_ps(r,sk_ai);
1158 t3 = _mm_sub_ps(sk_ai,r);
1159 obc_mask2 = _mm_cmplt_ps(raj, t2);
1160 obc_mask3 = _mm_cmplt_ps(raj, t3);
1162 uij = gmx_mm_inv_ps(t1);
1163 lij = _mm_or_ps(_mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1164 _mm_andnot_ps(obc_mask2,raj_inv));
1165 dlij = _mm_and_ps(one,obc_mask2);
1166 uij2 = _mm_mul_ps(uij, uij);
1167 uij3 = _mm_mul_ps(uij2,uij);
1168 lij2 = _mm_mul_ps(lij, lij);
1169 lij3 = _mm_mul_ps(lij2,lij);
1170 diff2 = _mm_sub_ps(uij2,lij2);
1171 lij_inv = gmx_mm_invsqrt_ps(lij2);
1172 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
1173 prod = _mm_mul_ps(onefourth,sk2_rinv);
1174 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1175 t1 = _mm_sub_ps(lij,uij);
1176 t2 = _mm_mul_ps(diff2,
1177 _mm_sub_ps(_mm_mul_ps(onefourth,r),
1179 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1180 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1181 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
1182 t4 = _mm_and_ps(t4,obc_mask3);
1183 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
/* tmp holds the masked ai->aj contribution scattered to work[] below. */
1185 tmp = _mm_and_ps(t1,obc_mask1);
1187 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1188 _mm_mul_ps(prod,lij3));
1190 _mm_mul_ps(onefourth,
1191 _mm_add_ps(_mm_mul_ps(lij,rinv),
1192 _mm_mul_ps(lij3,r))));
1193 t2 = _mm_mul_ps(onefourth,
1194 _mm_add_ps(_mm_mul_ps(uij,rinv),
1195 _mm_mul_ps(uij3,r)));
1197 _mm_add_ps(_mm_mul_ps(half,uij2),
1198 _mm_mul_ps(prod,uij3)));
1199 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1200 _mm_mul_ps(rinv,rinv));
1202 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1204 _mm_mul_ps(sk2_rinv,rinv))));
1205 t1 = _mm_mul_ps(rinv,
1206 _mm_add_ps(_mm_mul_ps(dlij,t1),
1207 _mm_add_ps(t2,t3)));
1208 dadx2 = _mm_and_ps(t1,obc_mask1);
1212 dadx2 = _mm_setzero_ps();
1213 tmp = _mm_setzero_ps();
1216 _mm_store_ps(dadx,dadx1);
1218 _mm_store_ps(dadx,dadx2);
1223 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
1227 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
1231 /* offset must be 3 */
1232 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
/* Horizontal reduction of the 4-wide sum_ai accumulator into work[ii]. */
1236 GMX_MM_UPDATE_1POT_PS(sum_ai,work+ii);
1240 /* Parallel summations */
1243 gmx_sum(natoms, work, cr);
1245 else if(DOMAINDECOMP(cr))
1247 dd_atom_sum_real(cr->dd, work);
/* HCT: radius from 1/rr minus the accumulated sum, clamped to min_rad. */
1250 if(gb_algorithm==egbHCT)
1253 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1255 if(born->use[i] != 0)
1257 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1258 sum = 1.0/rr - work[i];
1259 min_rad = rr + doffset;
1262 born->bRad[i] = rad > min_rad ? rad : min_rad;
1263 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1267 /* Extra communication required for DD */
1268 if(DOMAINDECOMP(cr))
1270 dd_atom_spread_real(cr->dd, born->bRad);
1271 dd_atom_spread_real(cr->dd, fr->invsqrta);
/* OBC: radius via the tanh(alpha*s - beta*s^2 + gamma*s^3) correction;
 * drobc[i] stores the chain-rule factor used later for forces. */
1277 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1279 if(born->use[i] != 0)
1281 rr = top->atomtypes.gb_radius[md->typeA[i]];
1289 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1290 born->bRad[i] = rr_inv - tsum*rr_inv2;
1291 born->bRad[i] = 1.0 / born->bRad[i];
1293 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1295 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1296 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1299 /* Extra (local) communication required for DD */
1300 if(DOMAINDECOMP(cr))
1302 dd_atom_spread_real(cr->dd, born->bRad);
1303 dd_atom_spread_real(cr->dd, fr->invsqrta);
1304 dd_atom_spread_real(cr->dd, born->drobc);
/* Apply the generalized-Born chain rule: combine the stored dadx terms
 * (derivatives of the Born radii w.r.t. interatomic distances, written by the
 * radius-calculation kernels) with dvda (derivative of the GB energy w.r.t.
 * the Born radii) to produce Cartesian forces on both i and j particles.
 * SSE2 single-precision version, processing neighbor-list j particles four
 * at a time with a 1/2/3-element scalar-style epilogue.
 *
 * NOTE(review): this excerpt is line-sampled; several statements of the full
 * source (index loads, dadx pointer advances, braces) are not shown here.
 */
1315 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1316 float *x, float *f, float *fshift, float *shiftvec,
1317 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1319 int i,k,n,ii,jnr,ii3,is3,nj0,nj1,offset,n0,n1;
1320 int jnrA,jnrB,jnrC,jnrD;
1321 int j3A,j3B,j3C,j3D;
1322 int jnrE,jnrF,jnrG,jnrH;
1323 int j3E,j3F,j3G,j3H;
1326 float rbi,shX,shY,shZ;
1338 __m128 rbai,rbaj,rbajB, f_gb, f_gb_ai,f_gbB,f_gb_aiB;
1339 __m128 xmm1,xmm2,xmm3;
1341 const __m128 two = _mm_set1_ps(2.0f);
1347 /* Loop to get the proper form for the Born radius term, sse style */
     /* rb[i] = d(energy)/d(radius) rescaled per algorithm so the inner loop
      * below can use a single multiply; the Still variant also folds in the
      * 1/ONE_4PI_EPS0 electrostatics prefactor.
      */
1353 if(gb_algorithm==egbSTILL)
1357 rbi = born->bRad[i];
1358 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1361 else if(gb_algorithm==egbHCT)
1365 rbi = born->bRad[i];
1366 rb[i] = rbi * rbi * dvda[i];
1369 else if(gb_algorithm==egbOBC)
     /* OBC additionally carries the drobc chain-rule factor computed when the
      * radii were evaluated.
      */
1373 rbi = born->bRad[i];
1374 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1378 jz = _mm_setzero_ps();
1380 n = j3A = j3B = j3C = j3D = 0;
     /* Outer loop over neighbor-list i entries. */
1382 for(i=0;i<nl->nri;i++)
     /* Periodic-image shift for this list entry — presumably the PBC shift
      * vector; confirm against the neighbor-list construction.
      */
1386 is3 = 3*nl->shift[i];
1387 shX = shiftvec[is3];
1388 shY = shiftvec[is3+1];
1389 shZ = shiftvec[is3+2];
1390 nj0 = nl->jindex[i];
1391 nj1 = nl->jindex[i+1];
     /* Broadcast the (shifted) i-particle position to all four SSE lanes. */
1393 ix = _mm_set1_ps(shX+x[ii3+0]);
1394 iy = _mm_set1_ps(shY+x[ii3+1]);
1395 iz = _mm_set1_ps(shZ+x[ii3+2]);
     /* 0-3 j particles left over after the 4-way vector loop. */
1397 offset = (nj1-nj0)%4;
1399 rbai = _mm_load1_ps(rb+ii);
1400 fix = _mm_setzero_ps();
1401 fiy = _mm_setzero_ps();
1402 fiz = _mm_setzero_ps();
     /* Main loop: four j particles per iteration. */
1405 for(k=nj0;k<nj1-offset;k+=4)
1417 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
1419 dx = _mm_sub_ps(ix,jx);
1420 dy = _mm_sub_ps(iy,jy);
1421 dz = _mm_sub_ps(iz,jz);
1423 GMX_MM_LOAD_4VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rb+jnrD,rbaj);
1425 /* load chain rule terms for j1-4 */
1426 f_gb = _mm_load_ps(dadx);
     /* NOTE(review): dadx is presumably advanced by 4 between these two loads
      * (the first holds d(rad_i)/dr terms, the second d(rad_j)/dr terms); the
      * increment is not visible in this excerpt — confirm in the full source.
      */
1428 f_gb_ai = _mm_load_ps(dadx);
1431 /* calculate scalar force */
1432 f_gb = _mm_mul_ps(f_gb,rbai);
1433 f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1434 f_gb = _mm_add_ps(f_gb,f_gb_ai);
     /* Project the scalar force onto the i-j separation vector. */
1436 tx = _mm_mul_ps(f_gb,dx);
1437 ty = _mm_mul_ps(f_gb,dy);
1438 tz = _mm_mul_ps(f_gb,dz);
1440 fix = _mm_add_ps(fix,tx);
1441 fiy = _mm_add_ps(fiy,ty);
1442 fiz = _mm_add_ps(fiz,tz);
     /* Newton's third law: subtract the same force from the j particles. */
1444 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A,f+j3B,f+j3C,f+j3D,tx,ty,tz);
1447 /*deal with odd elements */
     /* Epilogue: same force computation for the remaining 1, 2 or 3 j
      * particles, with partial-width load/store macros; unused lanes are
      * masked/zeroed by the macros.
      */
1454 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1455 GMX_MM_LOAD_1VALUE_PS(rb+jnrA,rbaj);
1463 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1464 GMX_MM_LOAD_2VALUES_PS(rb+jnrA,rb+jnrB,rbaj);
1468 /* offset must be 3 */
1475 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1476 GMX_MM_LOAD_3VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rbaj);
1479 dx = _mm_sub_ps(ix,jx);
1480 dy = _mm_sub_ps(iy,jy);
1481 dz = _mm_sub_ps(iz,jz);
1483 /* load chain rule terms for j1-4 */
1484 f_gb = _mm_load_ps(dadx);
     /* NOTE(review): as in the main loop, a dadx advance between these loads
      * is elided from this excerpt — confirm in the full source.
      */
1486 f_gb_ai = _mm_load_ps(dadx);
1489 /* calculate scalar force */
1490 f_gb = _mm_mul_ps(f_gb,rbai);
1491 f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1492 f_gb = _mm_add_ps(f_gb,f_gb_ai);
1494 tx = _mm_mul_ps(f_gb,dx);
1495 ty = _mm_mul_ps(f_gb,dy);
1496 tz = _mm_mul_ps(f_gb,dz);
1498 fix = _mm_add_ps(fix,tx);
1499 fiy = _mm_add_ps(fiy,ty);
1500 fiz = _mm_add_ps(fiz,tz);
     /* Reaction on the leftover j particles, width matching the offset. */
1504 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A,tx,ty,tz);
1508 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A,f+j3B,tx,ty,tz);
1512 /* offset must be 3 */
1513 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A,f+j3B,f+j3C,tx,ty,tz);
1517 /* fix/fiy/fiz now contain four partial force terms, that all should be
1518 * added to the i particle forces and shift forces.
     /* Horizontal-sum the four lanes into the i-particle force and the shift
      * (virial) force for this periodic image.
      */
1520 gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,f+ii3,fshift+is3);
1528 /* Keep compiler happy: ISO C forbids an empty translation unit, so when
      * SSE intrinsics are unavailable and the kernels above are compiled out,
      * emit one dummy symbol instead.
      */
1529 int genborn_sse_dummy;
1531 #endif /* SSE intrinsics available */