3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2008, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
32 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
52 #include "gmx_fatal.h"
53 #include "mtop_util.h"
64 /* Only compile this file if SSE intrinsics are available */
65 #if ( (defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_SSE2)) && !defined(GMX_DOUBLE) )
67 #include <gmx_sse2_single.h>
68 #include <xmmintrin.h>
69 #include <emmintrin.h>
71 #include "genborn_sse2_single.h"
75 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
76 int natoms, gmx_localtop_t *top,
77 const t_atomtypes *atype, float *x, t_nblist *nl,
80 int i,k,n,ii,is3,ii3,nj0,nj1,offset;
81 int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
82 int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
103 __m128 rsq,rinv,rinv2,rinv4,rinv6;
104 __m128 rsqB,rinvB,rinv2B,rinv4B,rinv6B;
105 __m128 ratio,gpi,rai,raj,vai,vaj,rvdw;
106 __m128 ratioB,rajB,vajB,rvdwB;
107 __m128 ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
108 __m128 ccfB,dccfB,thetaB,cosqB,termB,sinqB,resB,prodB;
109 __m128 mask,icf4,icf6,mask_cmp;
110 __m128 icf4B,icf6B,mask_cmpB;
112 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
113 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
114 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
116 const __m128 half = _mm_set1_ps(0.5f);
117 const __m128 three = _mm_set1_ps(3.0f);
118 const __m128 one = _mm_set1_ps(1.0f);
119 const __m128 two = _mm_set1_ps(2.0f);
120 const __m128 zero = _mm_set1_ps(0.0f);
121 const __m128 four = _mm_set1_ps(4.0f);
123 const __m128 still_p5inv = _mm_set1_ps(STILL_P5INV);
124 const __m128 still_pip5 = _mm_set1_ps(STILL_PIP5);
125 const __m128 still_p4 = _mm_set1_ps(STILL_P4);
127 factor = 0.5 * ONE_4PI_EPS0;
129 gb_radius = born->gb_radius;
131 work = born->gpol_still_work;
133 shiftvec = fr->shift_vec[0];
136 jnrA = jnrB = jnrC = jnrD = 0;
137 jx = _mm_setzero_ps();
138 jy = _mm_setzero_ps();
139 jz = _mm_setzero_ps();
143 for(i=0;i<natoms;i++)
148 for(i=0;i<nl->nri;i++)
152 is3 = 3*nl->shift[i];
154 shY = shiftvec[is3+1];
155 shZ = shiftvec[is3+2];
157 nj1 = nl->jindex[i+1];
159 ix = _mm_set1_ps(shX+x[ii3+0]);
160 iy = _mm_set1_ps(shY+x[ii3+1]);
161 iz = _mm_set1_ps(shZ+x[ii3+2]);
163 offset = (nj1-nj0)%4;
165 /* Polarization energy for atom ai */
166 gpi = _mm_setzero_ps();
168 rai = _mm_load1_ps(gb_radius+ii);
169 prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
171 for(k=nj0;k<nj1-4-offset;k+=8)
191 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
192 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
194 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
195 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
196 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
197 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE,vsolv+jnrF,vsolv+jnrG,vsolv+jnrH,vajB);
199 dx = _mm_sub_ps(ix,jx);
200 dy = _mm_sub_ps(iy,jy);
201 dz = _mm_sub_ps(iz,jz);
202 dxB = _mm_sub_ps(ix,jxB);
203 dyB = _mm_sub_ps(iy,jyB);
204 dzB = _mm_sub_ps(iz,jzB);
206 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
207 rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
208 rinv = gmx_mm_invsqrt_ps(rsq);
209 rinvB = gmx_mm_invsqrt_ps(rsqB);
210 rinv2 = _mm_mul_ps(rinv,rinv);
211 rinv2B = _mm_mul_ps(rinvB,rinvB);
212 rinv4 = _mm_mul_ps(rinv2,rinv2);
213 rinv4B = _mm_mul_ps(rinv2B,rinv2B);
214 rinv6 = _mm_mul_ps(rinv4,rinv2);
215 rinv6B = _mm_mul_ps(rinv4B,rinv2B);
217 rvdw = _mm_add_ps(rai,raj);
218 rvdwB = _mm_add_ps(rai,rajB);
219 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
220 ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB,rvdwB)));
222 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
223 mask_cmpB = _mm_cmple_ps(ratioB,still_p5inv);
225 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
226 if( 0 == _mm_movemask_ps(mask_cmp) )
228 /* if ratio>still_p5inv for ALL elements */
230 dccf = _mm_setzero_ps();
234 ratio = _mm_min_ps(ratio,still_p5inv);
235 theta = _mm_mul_ps(ratio,still_pip5);
236 gmx_mm_sincos_ps(theta,&sinq,&cosq);
237 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
238 ccf = _mm_mul_ps(term,term);
239 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
240 _mm_mul_ps(sinq,theta));
242 if( 0 == _mm_movemask_ps(mask_cmpB) )
244 /* if ratio>still_p5inv for ALL elements */
246 dccfB = _mm_setzero_ps();
250 ratioB = _mm_min_ps(ratioB,still_p5inv);
251 thetaB = _mm_mul_ps(ratioB,still_pip5);
252 gmx_mm_sincos_ps(thetaB,&sinqB,&cosqB);
253 termB = _mm_mul_ps(half,_mm_sub_ps(one,cosqB));
254 ccfB = _mm_mul_ps(termB,termB);
255 dccfB = _mm_mul_ps(_mm_mul_ps(two,termB),
256 _mm_mul_ps(sinqB,thetaB));
259 prod = _mm_mul_ps(still_p4,vaj);
260 prodB = _mm_mul_ps(still_p4,vajB);
261 icf4 = _mm_mul_ps(ccf,rinv4);
262 icf4B = _mm_mul_ps(ccfB,rinv4B);
263 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
264 icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccfB),dccfB), rinv6B);
266 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
267 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_mul_ps(prod_ai,icf4B));
269 gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod,icf4) , _mm_mul_ps(prodB,icf4B) ) );
271 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
273 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
275 _mm_store_ps(dadx,_mm_mul_ps(prodB,icf6B));
277 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6B));
281 for(;k<nj1-offset;k+=4)
293 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
295 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
296 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
298 dx = _mm_sub_ps(ix,jx);
299 dy = _mm_sub_ps(iy,jy);
300 dz = _mm_sub_ps(iz,jz);
302 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
303 rinv = gmx_mm_invsqrt_ps(rsq);
304 rinv2 = _mm_mul_ps(rinv,rinv);
305 rinv4 = _mm_mul_ps(rinv2,rinv2);
306 rinv6 = _mm_mul_ps(rinv4,rinv2);
308 rvdw = _mm_add_ps(rai,raj);
309 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
311 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
313 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
314 if(0 == _mm_movemask_ps(mask_cmp))
316 /* if ratio>still_p5inv for ALL elements */
318 dccf = _mm_setzero_ps();
322 ratio = _mm_min_ps(ratio,still_p5inv);
323 theta = _mm_mul_ps(ratio,still_pip5);
324 gmx_mm_sincos_ps(theta,&sinq,&cosq);
325 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
326 ccf = _mm_mul_ps(term,term);
327 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
328 _mm_mul_ps(sinq,theta));
331 prod = _mm_mul_ps(still_p4,vaj);
332 icf4 = _mm_mul_ps(ccf,rinv4);
333 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
335 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
337 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
339 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
341 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
351 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
352 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
353 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA,vaj);
362 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
363 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
364 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA,vsolv+jnrB,vaj);
369 /* offset must be 3 */
376 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
377 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
378 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vaj);
382 dx = _mm_sub_ps(ix,jx);
383 dy = _mm_sub_ps(iy,jy);
384 dz = _mm_sub_ps(iz,jz);
386 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
387 rinv = gmx_mm_invsqrt_ps(rsq);
388 rinv2 = _mm_mul_ps(rinv,rinv);
389 rinv4 = _mm_mul_ps(rinv2,rinv2);
390 rinv6 = _mm_mul_ps(rinv4,rinv2);
392 rvdw = _mm_add_ps(rai,raj);
393 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
395 mask_cmp = _mm_cmple_ps(ratio,still_p5inv);
397 if(0 == _mm_movemask_ps(mask_cmp))
399 /* if ratio>still_p5inv for ALL elements */
401 dccf = _mm_setzero_ps();
405 ratio = _mm_min_ps(ratio,still_p5inv);
406 theta = _mm_mul_ps(ratio,still_pip5);
407 gmx_mm_sincos_ps(theta,&sinq,&cosq);
408 term = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
409 ccf = _mm_mul_ps(term,term);
410 dccf = _mm_mul_ps(_mm_mul_ps(two,term),
411 _mm_mul_ps(sinq,theta));
414 prod = _mm_mul_ps(still_p4,vaj);
415 icf4 = _mm_mul_ps(ccf,rinv4);
416 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
418 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
420 _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
422 _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
425 tmp = _mm_mul_ps(prod_ai,icf4);
429 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
433 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
437 /* offset must be 3 */
438 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
441 GMX_MM_UPDATE_1POT_PS(gpi,work+ii);
444 /* Sum up the polarization energy from other nodes */
447 gmx_sum(natoms, work, cr);
449 else if(DOMAINDECOMP(cr))
451 dd_atom_sum_real(cr->dd, work);
454 /* Compute the radii */
455 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
457 if(born->use[i] != 0)
459 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
460 gpi2 = gpi_ai * gpi_ai;
461 born->bRad[i] = factor*gmx_invsqrt(gpi2);
462 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
466 /* Extra (local) communication required for DD */
469 dd_atom_spread_real(cr->dd, born->bRad);
470 dd_atom_spread_real(cr->dd, fr->invsqrta);
478 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
479 const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
481 int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
482 int jnrA,jnrB,jnrC,jnrD;
484 int jnrE,jnrF,jnrG,jnrH;
487 float rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
488 float sum_ai2, sum_ai3,tsum,tchain,doffset;
497 __m128 ix,iy,iz,jx,jy,jz;
498 __m128 dx,dy,dz,t1,t2,t3,t4;
500 __m128 rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
501 __m128 uij,lij2,uij2,lij3,uij3,diff2;
502 __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
503 __m128 sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
507 __m128 obc_mask1,obc_mask2,obc_mask3;
508 __m128 jxB,jyB,jzB,t1B,t2B,t3B,t4B;
509 __m128 dxB,dyB,dzB,rsqB,rinvB,rB;
510 __m128 rajB, raj_invB,rai_inv2B,sk2B,lijB,dlijB,duijB;
511 __m128 uijB,lij2B,uij2B,lij3B,uij3B,diff2B;
512 __m128 lij_invB,sk2_invB,prodB;
513 __m128 sk_ajB,sk2_ajB,sk2_rinvB;
514 __m128 dadx1B,dadx2B;
516 __m128 obc_mask1B,obc_mask2B,obc_mask3B;
518 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
519 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
520 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
522 __m128 oneeighth = _mm_set1_ps(0.125);
523 __m128 onefourth = _mm_set1_ps(0.25);
525 const __m128 half = _mm_set1_ps(0.5f);
526 const __m128 three = _mm_set1_ps(3.0f);
527 const __m128 one = _mm_set1_ps(1.0f);
528 const __m128 two = _mm_set1_ps(2.0f);
529 const __m128 zero = _mm_set1_ps(0.0f);
530 const __m128 neg = _mm_set1_ps(-1.0f);
532 /* Set the dielectric offset */
533 doffset = born->gb_doffset;
534 gb_radius = born->gb_radius;
535 obc_param = born->param;
536 work = born->gpol_hct_work;
539 shiftvec = fr->shift_vec[0];
541 jx = _mm_setzero_ps();
542 jy = _mm_setzero_ps();
543 jz = _mm_setzero_ps();
545 jnrA = jnrB = jnrC = jnrD = 0;
547 for(i=0;i<born->nr;i++)
552 for(i=0;i<nl->nri;i++)
556 is3 = 3*nl->shift[i];
558 shY = shiftvec[is3+1];
559 shZ = shiftvec[is3+2];
561 nj1 = nl->jindex[i+1];
563 ix = _mm_set1_ps(shX+x[ii3+0]);
564 iy = _mm_set1_ps(shY+x[ii3+1]);
565 iz = _mm_set1_ps(shZ+x[ii3+2]);
567 offset = (nj1-nj0)%4;
569 rai = _mm_load1_ps(gb_radius+ii);
570 rai_inv= gmx_mm_inv_ps(rai);
572 sum_ai = _mm_setzero_ps();
574 sk_ai = _mm_load1_ps(born->param+ii);
575 sk2_ai = _mm_mul_ps(sk_ai,sk_ai);
577 for(k=nj0;k<nj1-4-offset;k+=8)
597 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
598 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
599 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
600 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
601 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
602 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE,obc_param+jnrF,obc_param+jnrG,obc_param+jnrH,sk_ajB);
604 dx = _mm_sub_ps(ix, jx);
605 dy = _mm_sub_ps(iy, jy);
606 dz = _mm_sub_ps(iz, jz);
607 dxB = _mm_sub_ps(ix, jxB);
608 dyB = _mm_sub_ps(iy, jyB);
609 dzB = _mm_sub_ps(iz, jzB);
611 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
612 rsqB = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
614 rinv = gmx_mm_invsqrt_ps(rsq);
615 r = _mm_mul_ps(rsq,rinv);
616 rinvB = gmx_mm_invsqrt_ps(rsqB);
617 rB = _mm_mul_ps(rsqB,rinvB);
619 /* Compute raj_inv aj1-4 */
620 raj_inv = gmx_mm_inv_ps(raj);
621 raj_invB = gmx_mm_inv_ps(rajB);
623 /* Evaluate influence of atom aj -> ai */
624 t1 = _mm_add_ps(r,sk_aj);
625 t2 = _mm_sub_ps(r,sk_aj);
626 t3 = _mm_sub_ps(sk_aj,r);
627 t1B = _mm_add_ps(rB,sk_ajB);
628 t2B = _mm_sub_ps(rB,sk_ajB);
629 t3B = _mm_sub_ps(sk_ajB,rB);
630 obc_mask1 = _mm_cmplt_ps(rai, t1);
631 obc_mask2 = _mm_cmplt_ps(rai, t2);
632 obc_mask3 = _mm_cmplt_ps(rai, t3);
633 obc_mask1B = _mm_cmplt_ps(rai, t1B);
634 obc_mask2B = _mm_cmplt_ps(rai, t2B);
635 obc_mask3B = _mm_cmplt_ps(rai, t3B);
637 uij = gmx_mm_inv_ps(t1);
638 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
639 _mm_andnot_ps(obc_mask2,rai_inv));
640 dlij = _mm_and_ps(one,obc_mask2);
641 uij2 = _mm_mul_ps(uij, uij);
642 uij3 = _mm_mul_ps(uij2,uij);
643 lij2 = _mm_mul_ps(lij, lij);
644 lij3 = _mm_mul_ps(lij2,lij);
646 uijB = gmx_mm_inv_ps(t1B);
647 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
648 _mm_andnot_ps(obc_mask2B,rai_inv));
649 dlijB = _mm_and_ps(one,obc_mask2B);
650 uij2B = _mm_mul_ps(uijB, uijB);
651 uij3B = _mm_mul_ps(uij2B,uijB);
652 lij2B = _mm_mul_ps(lijB, lijB);
653 lij3B = _mm_mul_ps(lij2B,lijB);
655 diff2 = _mm_sub_ps(uij2,lij2);
656 lij_inv = gmx_mm_invsqrt_ps(lij2);
657 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
658 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
659 prod = _mm_mul_ps(onefourth,sk2_rinv);
661 diff2B = _mm_sub_ps(uij2B,lij2B);
662 lij_invB = gmx_mm_invsqrt_ps(lij2B);
663 sk2_ajB = _mm_mul_ps(sk_ajB,sk_ajB);
664 sk2_rinvB = _mm_mul_ps(sk2_ajB,rinvB);
665 prodB = _mm_mul_ps(onefourth,sk2_rinvB);
667 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
668 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
670 t1 = _mm_sub_ps(lij,uij);
671 t2 = _mm_mul_ps(diff2,
672 _mm_sub_ps(_mm_mul_ps(onefourth,r),
674 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
675 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
676 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
677 t4 = _mm_and_ps(t4,obc_mask3);
678 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
680 t1B = _mm_sub_ps(lijB,uijB);
681 t2B = _mm_mul_ps(diff2B,
682 _mm_sub_ps(_mm_mul_ps(onefourth,rB),
684 t3B = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
685 t1B = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
686 t4B = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lijB));
687 t4B = _mm_and_ps(t4B,obc_mask3B);
688 t1B = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
690 sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1,obc_mask1), _mm_and_ps(t1B,obc_mask1B) ));
692 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
693 _mm_mul_ps(prod,lij3));
695 _mm_mul_ps(onefourth,
696 _mm_add_ps(_mm_mul_ps(lij,rinv),
697 _mm_mul_ps(lij3,r))));
698 t2 = _mm_mul_ps(onefourth,
699 _mm_add_ps(_mm_mul_ps(uij,rinv),
700 _mm_mul_ps(uij3,r)));
702 _mm_add_ps(_mm_mul_ps(half,uij2),
703 _mm_mul_ps(prod,uij3)));
704 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
705 _mm_mul_ps(rinv,rinv));
707 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
709 _mm_mul_ps(sk2_rinv,rinv))));
710 t1 = _mm_mul_ps(rinv,
711 _mm_add_ps(_mm_mul_ps(dlij,t1),
716 t1B = _mm_add_ps(_mm_mul_ps(half,lij2B),
717 _mm_mul_ps(prodB,lij3B));
718 t1B = _mm_sub_ps(t1B,
719 _mm_mul_ps(onefourth,
720 _mm_add_ps(_mm_mul_ps(lijB,rinvB),
721 _mm_mul_ps(lij3B,rB))));
722 t2B = _mm_mul_ps(onefourth,
723 _mm_add_ps(_mm_mul_ps(uijB,rinvB),
724 _mm_mul_ps(uij3B,rB)));
725 t2B = _mm_sub_ps(t2B,
726 _mm_add_ps(_mm_mul_ps(half,uij2B),
727 _mm_mul_ps(prodB,uij3B)));
728 t3B = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
729 _mm_mul_ps(rinvB,rinvB));
730 t3B = _mm_sub_ps(t3B,
731 _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
733 _mm_mul_ps(sk2_rinvB,rinvB))));
734 t1B = _mm_mul_ps(rinvB,
735 _mm_add_ps(_mm_mul_ps(dlijB,t1B),
736 _mm_add_ps(t2B,t3B)));
738 dadx1 = _mm_and_ps(t1,obc_mask1);
739 dadx1B = _mm_and_ps(t1B,obc_mask1B);
742 /* Evaluate influence of atom ai -> aj */
743 t1 = _mm_add_ps(r,sk_ai);
744 t2 = _mm_sub_ps(r,sk_ai);
745 t3 = _mm_sub_ps(sk_ai,r);
746 t1B = _mm_add_ps(rB,sk_ai);
747 t2B = _mm_sub_ps(rB,sk_ai);
748 t3B = _mm_sub_ps(sk_ai,rB);
749 obc_mask1 = _mm_cmplt_ps(raj, t1);
750 obc_mask2 = _mm_cmplt_ps(raj, t2);
751 obc_mask3 = _mm_cmplt_ps(raj, t3);
752 obc_mask1B = _mm_cmplt_ps(rajB, t1B);
753 obc_mask2B = _mm_cmplt_ps(rajB, t2B);
754 obc_mask3B = _mm_cmplt_ps(rajB, t3B);
756 uij = gmx_mm_inv_ps(t1);
757 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
758 _mm_andnot_ps(obc_mask2,raj_inv));
759 dlij = _mm_and_ps(one,obc_mask2);
760 uij2 = _mm_mul_ps(uij, uij);
761 uij3 = _mm_mul_ps(uij2,uij);
762 lij2 = _mm_mul_ps(lij, lij);
763 lij3 = _mm_mul_ps(lij2,lij);
765 uijB = gmx_mm_inv_ps(t1B);
766 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
767 _mm_andnot_ps(obc_mask2B,raj_invB));
768 dlijB = _mm_and_ps(one,obc_mask2B);
769 uij2B = _mm_mul_ps(uijB, uijB);
770 uij3B = _mm_mul_ps(uij2B,uijB);
771 lij2B = _mm_mul_ps(lijB, lijB);
772 lij3B = _mm_mul_ps(lij2B,lijB);
774 diff2 = _mm_sub_ps(uij2,lij2);
775 lij_inv = gmx_mm_invsqrt_ps(lij2);
776 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
777 prod = _mm_mul_ps(onefourth,sk2_rinv);
779 diff2B = _mm_sub_ps(uij2B,lij2B);
780 lij_invB = gmx_mm_invsqrt_ps(lij2B);
781 sk2_rinvB = _mm_mul_ps(sk2_ai,rinvB);
782 prodB = _mm_mul_ps(onefourth,sk2_rinvB);
784 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
785 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
787 t1 = _mm_sub_ps(lij,uij);
788 t2 = _mm_mul_ps(diff2,
789 _mm_sub_ps(_mm_mul_ps(onefourth,r),
791 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
792 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
793 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
794 t4 = _mm_and_ps(t4,obc_mask3);
795 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
797 t1B = _mm_sub_ps(lijB,uijB);
798 t2B = _mm_mul_ps(diff2B,
799 _mm_sub_ps(_mm_mul_ps(onefourth,rB),
801 t3B = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
802 t1B = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
803 t4B = _mm_mul_ps(two,_mm_sub_ps(raj_invB,lijB));
804 t4B = _mm_and_ps(t4B,obc_mask3B);
805 t1B = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
807 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
808 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_and_ps(t1B,obc_mask1B));
810 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
811 _mm_mul_ps(prod,lij3));
813 _mm_mul_ps(onefourth,
814 _mm_add_ps(_mm_mul_ps(lij,rinv),
815 _mm_mul_ps(lij3,r))));
816 t2 = _mm_mul_ps(onefourth,
817 _mm_add_ps(_mm_mul_ps(uij,rinv),
818 _mm_mul_ps(uij3,r)));
820 _mm_add_ps(_mm_mul_ps(half,uij2),
821 _mm_mul_ps(prod,uij3)));
822 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
823 _mm_mul_ps(rinv,rinv));
825 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
827 _mm_mul_ps(sk2_rinv,rinv))));
828 t1 = _mm_mul_ps(rinv,
829 _mm_add_ps(_mm_mul_ps(dlij,t1),
833 t1B = _mm_add_ps(_mm_mul_ps(half,lij2B),
834 _mm_mul_ps(prodB,lij3B));
835 t1B = _mm_sub_ps(t1B,
836 _mm_mul_ps(onefourth,
837 _mm_add_ps(_mm_mul_ps(lijB,rinvB),
838 _mm_mul_ps(lij3B,rB))));
839 t2B = _mm_mul_ps(onefourth,
840 _mm_add_ps(_mm_mul_ps(uijB,rinvB),
841 _mm_mul_ps(uij3B,rB)));
842 t2B = _mm_sub_ps(t2B,
843 _mm_add_ps(_mm_mul_ps(half,uij2B),
844 _mm_mul_ps(prodB,uij3B)));
845 t3B = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
846 _mm_mul_ps(rinvB,rinvB));
847 t3B = _mm_sub_ps(t3B,
848 _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
850 _mm_mul_ps(sk2_rinvB,rinvB))));
851 t1B = _mm_mul_ps(rinvB,
852 _mm_add_ps(_mm_mul_ps(dlijB,t1B),
853 _mm_add_ps(t2B,t3B)));
856 dadx2 = _mm_and_ps(t1,obc_mask1);
857 dadx2B = _mm_and_ps(t1B,obc_mask1B);
859 _mm_store_ps(dadx,dadx1);
861 _mm_store_ps(dadx,dadx2);
863 _mm_store_ps(dadx,dadx1B);
865 _mm_store_ps(dadx,dadx2B);
868 } /* end normal inner loop */
870 for(;k<nj1-offset;k+=4)
882 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
883 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
884 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
886 dx = _mm_sub_ps(ix, jx);
887 dy = _mm_sub_ps(iy, jy);
888 dz = _mm_sub_ps(iz, jz);
890 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
892 rinv = gmx_mm_invsqrt_ps(rsq);
893 r = _mm_mul_ps(rsq,rinv);
895 /* Compute raj_inv aj1-4 */
896 raj_inv = gmx_mm_inv_ps(raj);
898 /* Evaluate influence of atom aj -> ai */
899 t1 = _mm_add_ps(r,sk_aj);
900 obc_mask1 = _mm_cmplt_ps(rai, t1);
902 if(_mm_movemask_ps(obc_mask1))
904 /* If any of the elements has rai<dr+sk, this is executed */
905 t2 = _mm_sub_ps(r,sk_aj);
906 t3 = _mm_sub_ps(sk_aj,r);
908 obc_mask2 = _mm_cmplt_ps(rai, t2);
909 obc_mask3 = _mm_cmplt_ps(rai, t3);
911 uij = gmx_mm_inv_ps(t1);
912 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
913 _mm_andnot_ps(obc_mask2,rai_inv));
914 dlij = _mm_and_ps(one,obc_mask2);
915 uij2 = _mm_mul_ps(uij, uij);
916 uij3 = _mm_mul_ps(uij2,uij);
917 lij2 = _mm_mul_ps(lij, lij);
918 lij3 = _mm_mul_ps(lij2,lij);
919 diff2 = _mm_sub_ps(uij2,lij2);
920 lij_inv = gmx_mm_invsqrt_ps(lij2);
921 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
922 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
923 prod = _mm_mul_ps(onefourth,sk2_rinv);
924 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
925 t1 = _mm_sub_ps(lij,uij);
926 t2 = _mm_mul_ps(diff2,
927 _mm_sub_ps(_mm_mul_ps(onefourth,r),
929 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
930 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
931 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
932 t4 = _mm_and_ps(t4,obc_mask3);
933 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
934 sum_ai = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
935 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
936 _mm_mul_ps(prod,lij3));
938 _mm_mul_ps(onefourth,
939 _mm_add_ps(_mm_mul_ps(lij,rinv),
940 _mm_mul_ps(lij3,r))));
941 t2 = _mm_mul_ps(onefourth,
942 _mm_add_ps(_mm_mul_ps(uij,rinv),
943 _mm_mul_ps(uij3,r)));
945 _mm_add_ps(_mm_mul_ps(half,uij2),
946 _mm_mul_ps(prod,uij3)));
947 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
948 _mm_mul_ps(rinv,rinv));
950 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
952 _mm_mul_ps(sk2_rinv,rinv))));
953 t1 = _mm_mul_ps(rinv,
954 _mm_add_ps(_mm_mul_ps(dlij,t1),
957 dadx1 = _mm_and_ps(t1,obc_mask1);
961 dadx1 = _mm_setzero_ps();
964 /* Evaluate influence of atom ai -> aj */
965 t1 = _mm_add_ps(r,sk_ai);
966 obc_mask1 = _mm_cmplt_ps(raj, t1);
968 if(_mm_movemask_ps(obc_mask1))
970 t2 = _mm_sub_ps(r,sk_ai);
971 t3 = _mm_sub_ps(sk_ai,r);
972 obc_mask2 = _mm_cmplt_ps(raj, t2);
973 obc_mask3 = _mm_cmplt_ps(raj, t3);
975 uij = gmx_mm_inv_ps(t1);
976 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
977 _mm_andnot_ps(obc_mask2,raj_inv));
978 dlij = _mm_and_ps(one,obc_mask2);
979 uij2 = _mm_mul_ps(uij, uij);
980 uij3 = _mm_mul_ps(uij2,uij);
981 lij2 = _mm_mul_ps(lij, lij);
982 lij3 = _mm_mul_ps(lij2,lij);
983 diff2 = _mm_sub_ps(uij2,lij2);
984 lij_inv = gmx_mm_invsqrt_ps(lij2);
985 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
986 prod = _mm_mul_ps(onefourth,sk2_rinv);
987 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
988 t1 = _mm_sub_ps(lij,uij);
989 t2 = _mm_mul_ps(diff2,
990 _mm_sub_ps(_mm_mul_ps(onefourth,r),
992 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
993 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
994 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
995 t4 = _mm_and_ps(t4,obc_mask3);
996 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
998 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
1000 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1001 _mm_mul_ps(prod,lij3));
1003 _mm_mul_ps(onefourth,
1004 _mm_add_ps(_mm_mul_ps(lij,rinv),
1005 _mm_mul_ps(lij3,r))));
1006 t2 = _mm_mul_ps(onefourth,
1007 _mm_add_ps(_mm_mul_ps(uij,rinv),
1008 _mm_mul_ps(uij3,r)));
1010 _mm_add_ps(_mm_mul_ps(half,uij2),
1011 _mm_mul_ps(prod,uij3)));
1012 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1013 _mm_mul_ps(rinv,rinv));
1015 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1017 _mm_mul_ps(sk2_rinv,rinv))));
1018 t1 = _mm_mul_ps(rinv,
1019 _mm_add_ps(_mm_mul_ps(dlij,t1),
1020 _mm_add_ps(t2,t3)));
1021 dadx2 = _mm_and_ps(t1,obc_mask1);
1025 dadx2 = _mm_setzero_ps();
1028 _mm_store_ps(dadx,dadx1);
1030 _mm_store_ps(dadx,dadx2);
1032 } /* end normal inner loop */
1040 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1041 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
1042 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA,sk_aj);
1051 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1052 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
1053 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA,obc_param+jnrB,sk_aj);
1058 /* offset must be 3 */
1065 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1066 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
1067 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,sk_aj);
1071 dx = _mm_sub_ps(ix, jx);
1072 dy = _mm_sub_ps(iy, jy);
1073 dz = _mm_sub_ps(iz, jz);
1075 rsq = gmx_mm_calc_rsq_ps(dx,dy,dz);
1077 rinv = gmx_mm_invsqrt_ps(rsq);
1078 r = _mm_mul_ps(rsq,rinv);
1080 /* Compute raj_inv aj1-4 */
1081 raj_inv = gmx_mm_inv_ps(raj);
1083 /* Evaluate influence of atom aj -> ai */
1084 t1 = _mm_add_ps(r,sk_aj);
1085 obc_mask1 = _mm_cmplt_ps(rai, t1);
1086 obc_mask1 = _mm_and_ps(obc_mask1,mask);
1088 if(_mm_movemask_ps(obc_mask1))
1090 t2 = _mm_sub_ps(r,sk_aj);
1091 t3 = _mm_sub_ps(sk_aj,r);
1092 obc_mask2 = _mm_cmplt_ps(rai, t2);
1093 obc_mask3 = _mm_cmplt_ps(rai, t3);
1095 uij = gmx_mm_inv_ps(t1);
1096 lij = _mm_or_ps( _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1097 _mm_andnot_ps(obc_mask2,rai_inv));
1098 dlij = _mm_and_ps(one,obc_mask2);
1099 uij2 = _mm_mul_ps(uij, uij);
1100 uij3 = _mm_mul_ps(uij2,uij);
1101 lij2 = _mm_mul_ps(lij, lij);
1102 lij3 = _mm_mul_ps(lij2,lij);
1103 diff2 = _mm_sub_ps(uij2,lij2);
1104 lij_inv = gmx_mm_invsqrt_ps(lij2);
1105 sk2_aj = _mm_mul_ps(sk_aj,sk_aj);
1106 sk2_rinv = _mm_mul_ps(sk2_aj,rinv);
1107 prod = _mm_mul_ps(onefourth,sk2_rinv);
1108 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1109 t1 = _mm_sub_ps(lij,uij);
1110 t2 = _mm_mul_ps(diff2,
1111 _mm_sub_ps(_mm_mul_ps(onefourth,r),
1113 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1114 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1115 t4 = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
1116 t4 = _mm_and_ps(t4,obc_mask3);
1117 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1118 sum_ai = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
1119 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1120 _mm_mul_ps(prod,lij3));
1122 _mm_mul_ps(onefourth,
1123 _mm_add_ps(_mm_mul_ps(lij,rinv),
1124 _mm_mul_ps(lij3,r))));
1125 t2 = _mm_mul_ps(onefourth,
1126 _mm_add_ps(_mm_mul_ps(uij,rinv),
1127 _mm_mul_ps(uij3,r)));
1129 _mm_add_ps(_mm_mul_ps(half,uij2),
1130 _mm_mul_ps(prod,uij3)));
1131 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1132 _mm_mul_ps(rinv,rinv));
1134 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1136 _mm_mul_ps(sk2_rinv,rinv))));
1137 t1 = _mm_mul_ps(rinv,
1138 _mm_add_ps(_mm_mul_ps(dlij,t1),
1139 _mm_add_ps(t2,t3)));
1140 dadx1 = _mm_and_ps(t1,obc_mask1);
1144 dadx1 = _mm_setzero_ps();
1147 /* Evaluate influence of atom ai -> aj */
1148 t1 = _mm_add_ps(r,sk_ai);
1149 obc_mask1 = _mm_cmplt_ps(raj, t1);
1150 obc_mask1 = _mm_and_ps(obc_mask1,mask);
1152 if(_mm_movemask_ps(obc_mask1))
1154 t2 = _mm_sub_ps(r,sk_ai);
1155 t3 = _mm_sub_ps(sk_ai,r);
1156 obc_mask2 = _mm_cmplt_ps(raj, t2);
1157 obc_mask3 = _mm_cmplt_ps(raj, t3);
1159 uij = gmx_mm_inv_ps(t1);
1160 lij = _mm_or_ps(_mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1161 _mm_andnot_ps(obc_mask2,raj_inv));
1162 dlij = _mm_and_ps(one,obc_mask2);
1163 uij2 = _mm_mul_ps(uij, uij);
1164 uij3 = _mm_mul_ps(uij2,uij);
1165 lij2 = _mm_mul_ps(lij, lij);
1166 lij3 = _mm_mul_ps(lij2,lij);
1167 diff2 = _mm_sub_ps(uij2,lij2);
1168 lij_inv = gmx_mm_invsqrt_ps(lij2);
1169 sk2_rinv = _mm_mul_ps(sk2_ai,rinv);
1170 prod = _mm_mul_ps(onefourth,sk2_rinv);
1171 logterm = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1172 t1 = _mm_sub_ps(lij,uij);
1173 t2 = _mm_mul_ps(diff2,
1174 _mm_sub_ps(_mm_mul_ps(onefourth,r),
1176 t3 = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1177 t1 = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1178 t4 = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
1179 t4 = _mm_and_ps(t4,obc_mask3);
1180 t1 = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1182 tmp = _mm_and_ps(t1,obc_mask1);
1184 t1 = _mm_add_ps(_mm_mul_ps(half,lij2),
1185 _mm_mul_ps(prod,lij3));
1187 _mm_mul_ps(onefourth,
1188 _mm_add_ps(_mm_mul_ps(lij,rinv),
1189 _mm_mul_ps(lij3,r))));
1190 t2 = _mm_mul_ps(onefourth,
1191 _mm_add_ps(_mm_mul_ps(uij,rinv),
1192 _mm_mul_ps(uij3,r)));
1194 _mm_add_ps(_mm_mul_ps(half,uij2),
1195 _mm_mul_ps(prod,uij3)));
1196 t3 = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1197 _mm_mul_ps(rinv,rinv));
1199 _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1201 _mm_mul_ps(sk2_rinv,rinv))));
1202 t1 = _mm_mul_ps(rinv,
1203 _mm_add_ps(_mm_mul_ps(dlij,t1),
1204 _mm_add_ps(t2,t3)));
1205 dadx2 = _mm_and_ps(t1,obc_mask1);
1209 dadx2 = _mm_setzero_ps();
1210 tmp = _mm_setzero_ps();
1213 _mm_store_ps(dadx,dadx1);
1215 _mm_store_ps(dadx,dadx2);
1220 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
1224 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
1228 /* offset must be 3 */
1229 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
1233 GMX_MM_UPDATE_1POT_PS(sum_ai,work+ii);
1237 /* Parallel summations */
1240 gmx_sum(natoms, work, cr);
1242 else if(DOMAINDECOMP(cr))
1244 dd_atom_sum_real(cr->dd, work);
1247 if(gb_algorithm==egbHCT)
1250 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1252 if(born->use[i] != 0)
1254 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1255 sum = 1.0/rr - work[i];
1256 min_rad = rr + doffset;
1259 born->bRad[i] = rad > min_rad ? rad : min_rad;
1260 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1264 /* Extra communication required for DD */
1265 if(DOMAINDECOMP(cr))
1267 dd_atom_spread_real(cr->dd, born->bRad);
1268 dd_atom_spread_real(cr->dd, fr->invsqrta);
1274 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1276 if(born->use[i] != 0)
1278 rr = top->atomtypes.gb_radius[md->typeA[i]];
1286 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1287 born->bRad[i] = rr_inv - tsum*rr_inv2;
1288 born->bRad[i] = 1.0 / born->bRad[i];
1290 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1292 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1293 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1296 /* Extra (local) communication required for DD */
1297 if(DOMAINDECOMP(cr))
1299 dd_atom_spread_real(cr->dd, born->bRad);
1300 dd_atom_spread_real(cr->dd, fr->invsqrta);
1301 dd_atom_spread_real(cr->dd, born->drobc);
1312 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1313 float *x, float *f, float *fshift, float *shiftvec,
1314 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1316 int i,k,n,ii,jnr,ii3,is3,nj0,nj1,offset,n0,n1;
1317 int jnrA,jnrB,jnrC,jnrD;
1318 int j3A,j3B,j3C,j3D;
1319 int jnrE,jnrF,jnrG,jnrH;
1320 int j3E,j3F,j3G,j3H;
1323 float rbi,shX,shY,shZ;
1335 __m128 rbai,rbaj,rbajB, f_gb, f_gb_ai,f_gbB,f_gb_aiB;
1336 __m128 xmm1,xmm2,xmm3;
1338 const __m128 two = _mm_set1_ps(2.0f);
1344 /* Loop to get the proper form for the Born radius term, sse style */
1350 if(gb_algorithm==egbSTILL)
1354 rbi = born->bRad[i];
1355 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1358 else if(gb_algorithm==egbHCT)
1362 rbi = born->bRad[i];
1363 rb[i] = rbi * rbi * dvda[i];
1366 else if(gb_algorithm==egbOBC)
1370 rbi = born->bRad[i];
1371 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1375 jz = _mm_setzero_ps();
1377 n = j3A = j3B = j3C = j3D = 0;
1379 for(i=0;i<nl->nri;i++)
1383 is3 = 3*nl->shift[i];
1384 shX = shiftvec[is3];
1385 shY = shiftvec[is3+1];
1386 shZ = shiftvec[is3+2];
1387 nj0 = nl->jindex[i];
1388 nj1 = nl->jindex[i+1];
1390 ix = _mm_set1_ps(shX+x[ii3+0]);
1391 iy = _mm_set1_ps(shY+x[ii3+1]);
1392 iz = _mm_set1_ps(shZ+x[ii3+2]);
1394 offset = (nj1-nj0)%4;
1396 rbai = _mm_load1_ps(rb+ii);
1397 fix = _mm_setzero_ps();
1398 fiy = _mm_setzero_ps();
1399 fiz = _mm_setzero_ps();
1402 for(k=nj0;k<nj1-offset;k+=4)
1414 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
1416 dx = _mm_sub_ps(ix,jx);
1417 dy = _mm_sub_ps(iy,jy);
1418 dz = _mm_sub_ps(iz,jz);
1420 GMX_MM_LOAD_4VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rb+jnrD,rbaj);
1422 /* load chain rule terms for j1-4 */
1423 f_gb = _mm_load_ps(dadx);
1425 f_gb_ai = _mm_load_ps(dadx);
1428 /* calculate scalar force */
1429 f_gb = _mm_mul_ps(f_gb,rbai);
1430 f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1431 f_gb = _mm_add_ps(f_gb,f_gb_ai);
1433 tx = _mm_mul_ps(f_gb,dx);
1434 ty = _mm_mul_ps(f_gb,dy);
1435 tz = _mm_mul_ps(f_gb,dz);
1437 fix = _mm_add_ps(fix,tx);
1438 fiy = _mm_add_ps(fiy,ty);
1439 fiz = _mm_add_ps(fiz,tz);
1441 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A,f+j3B,f+j3C,f+j3D,tx,ty,tz);
1444 /*deal with odd elements */
1451 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1452 GMX_MM_LOAD_1VALUE_PS(rb+jnrA,rbaj);
1460 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1461 GMX_MM_LOAD_2VALUES_PS(rb+jnrA,rb+jnrB,rbaj);
1465 /* offset must be 3 */
1472 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1473 GMX_MM_LOAD_3VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rbaj);
1476 dx = _mm_sub_ps(ix,jx);
1477 dy = _mm_sub_ps(iy,jy);
1478 dz = _mm_sub_ps(iz,jz);
1480 /* load chain rule terms for j1-4 */
1481 f_gb = _mm_load_ps(dadx);
1483 f_gb_ai = _mm_load_ps(dadx);
1486 /* calculate scalar force */
1487 f_gb = _mm_mul_ps(f_gb,rbai);
1488 f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1489 f_gb = _mm_add_ps(f_gb,f_gb_ai);
1491 tx = _mm_mul_ps(f_gb,dx);
1492 ty = _mm_mul_ps(f_gb,dy);
1493 tz = _mm_mul_ps(f_gb,dz);
1495 fix = _mm_add_ps(fix,tx);
1496 fiy = _mm_add_ps(fiy,ty);
1497 fiz = _mm_add_ps(fiz,tz);
1501 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A,tx,ty,tz);
1505 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A,f+j3B,tx,ty,tz);
1509 /* offset must be 3 */
1510 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A,f+j3B,f+j3C,tx,ty,tz);
1514 /* fix/fiy/fiz now contain four partial force terms, that all should be
1515 * added to the i particle forces and shift forces.
1517 gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,f+ii3,fshift+is3);
1525 /* keep compiler happy */
1526 int genborn_sse_dummy;
1528 #endif /* SSE intrinsics available */