2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
56 #include "gmx_fatal.h"
57 #include "mtop_util.h"
67 /* Only compile this file if SSE2 intrinsics are available */
68 #if 0 && defined (GMX_X86_SSE2)
69 #include <gmx_sse2_double.h>
70 #include <emmintrin.h>
72 #include "genborn_sse2_double.h"
75 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
76 int natoms, gmx_localtop_t *top,
77 const t_atomtypes *atype, double *x, t_nblist *nl,
80 int i,k,n,ii,is3,ii3,nj0,nj1,offset;
81 int jnrA,jnrB,j3A,j3B;
98 __m128d rsq,rinv,rinv2,rinv4,rinv6;
99 __m128d ratio,gpi,rai,raj,vai,vaj,rvdw;
100 __m128d ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
101 __m128d mask,icf4,icf6,mask_cmp;
103 const __m128d half = _mm_set1_pd(0.5);
104 const __m128d three = _mm_set1_pd(3.0);
105 const __m128d one = _mm_set1_pd(1.0);
106 const __m128d two = _mm_set1_pd(2.0);
107 const __m128d zero = _mm_set1_pd(0.0);
108 const __m128d four = _mm_set1_pd(4.0);
110 const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
111 const __m128d still_pip5 = _mm_set1_pd(STILL_PIP5);
112 const __m128d still_p4 = _mm_set1_pd(STILL_P4);
114 factor = 0.5 * ONE_4PI_EPS0;
116 gb_radius = born->gb_radius;
118 work = born->gpol_still_work;
120 shiftvec = fr->shift_vec[0];
124 jx = _mm_setzero_pd();
125 jy = _mm_setzero_pd();
126 jz = _mm_setzero_pd();
130 for(i=0;i<natoms;i++)
135 for(i=0;i<nl->nri;i++)
139 is3 = 3*nl->shift[i];
141 shY = shiftvec[is3+1];
142 shZ = shiftvec[is3+2];
144 nj1 = nl->jindex[i+1];
146 ix = _mm_set1_pd(shX+x[ii3+0]);
147 iy = _mm_set1_pd(shY+x[ii3+1]);
148 iz = _mm_set1_pd(shZ+x[ii3+2]);
151 /* Polarization energy for atom ai */
152 gpi = _mm_setzero_pd();
154 rai = _mm_load1_pd(gb_radius+ii);
155 prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
157 for(k=nj0;k<nj1-1;k+=2)
165 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
167 GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
168 GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA,vsolv+jnrB,vaj);
170 dx = _mm_sub_pd(ix,jx);
171 dy = _mm_sub_pd(iy,jy);
172 dz = _mm_sub_pd(iz,jz);
174 rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
175 rinv = gmx_mm_invsqrt_pd(rsq);
176 rinv2 = _mm_mul_pd(rinv,rinv);
177 rinv4 = _mm_mul_pd(rinv2,rinv2);
178 rinv6 = _mm_mul_pd(rinv4,rinv2);
180 rvdw = _mm_add_pd(rai,raj);
181 ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
183 mask_cmp = _mm_cmple_pd(ratio,still_p5inv);
185 /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
186 if( 0 == _mm_movemask_pd(mask_cmp) )
188 /* if ratio>still_p5inv for ALL elements */
190 dccf = _mm_setzero_pd();
194 ratio = _mm_min_pd(ratio,still_p5inv);
195 theta = _mm_mul_pd(ratio,still_pip5);
196 gmx_mm_sincos_pd(theta,&sinq,&cosq);
197 term = _mm_mul_pd(half,_mm_sub_pd(one,cosq));
198 ccf = _mm_mul_pd(term,term);
199 dccf = _mm_mul_pd(_mm_mul_pd(two,term),
200 _mm_mul_pd(sinq,theta));
203 prod = _mm_mul_pd(still_p4,vaj);
204 icf4 = _mm_mul_pd(ccf,rinv4);
205 icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four,ccf),dccf), rinv6);
207 GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_mul_pd(prod_ai,icf4));
209 gpi = _mm_add_pd(gpi, _mm_mul_pd(prod,icf4) );
211 _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
213 _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
223 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
225 GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
226 GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA,vaj);
228 dx = _mm_sub_sd(ix,jx);
229 dy = _mm_sub_sd(iy,jy);
230 dz = _mm_sub_sd(iz,jz);
232 rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
233 rinv = gmx_mm_invsqrt_pd(rsq);
234 rinv2 = _mm_mul_sd(rinv,rinv);
235 rinv4 = _mm_mul_sd(rinv2,rinv2);
236 rinv6 = _mm_mul_sd(rinv4,rinv2);
238 rvdw = _mm_add_sd(rai,raj);
239 ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
241 mask_cmp = _mm_cmple_sd(ratio,still_p5inv);
243 /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
244 if( 0 == _mm_movemask_pd(mask_cmp) )
246 /* if ratio>still_p5inv for ALL elements */
248 dccf = _mm_setzero_pd();
252 ratio = _mm_min_sd(ratio,still_p5inv);
253 theta = _mm_mul_sd(ratio,still_pip5);
254 gmx_mm_sincos_pd(theta,&sinq,&cosq);
255 term = _mm_mul_sd(half,_mm_sub_sd(one,cosq));
256 ccf = _mm_mul_sd(term,term);
257 dccf = _mm_mul_sd(_mm_mul_sd(two,term),
258 _mm_mul_sd(sinq,theta));
261 prod = _mm_mul_sd(still_p4,vaj);
262 icf4 = _mm_mul_sd(ccf,rinv4);
263 icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four,ccf),dccf), rinv6);
265 GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_mul_sd(prod_ai,icf4));
267 gpi = _mm_add_sd(gpi, _mm_mul_sd(prod,icf4) );
269 _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
271 _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
274 gmx_mm_update_1pot_pd(gpi,work+ii);
277 /* Sum up the polarization energy from other nodes */
280 gmx_sum(natoms, work, cr);
282 else if(DOMAINDECOMP(cr))
284 dd_atom_sum_real(cr->dd, work);
287 /* Compute the radii */
288 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
290 if(born->use[i] != 0)
292 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
293 gpi2 = gpi_ai * gpi_ai;
294 born->bRad[i] = factor*gmx_invsqrt(gpi2);
295 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
299 /* Extra (local) communication required for DD */
302 dd_atom_spread_real(cr->dd, born->bRad);
303 dd_atom_spread_real(cr->dd, fr->invsqrta);
311 calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
312 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
314 int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
318 double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
319 double sum_ai2, sum_ai3,tsum,tchain,doffset;
328 __m128d ix,iy,iz,jx,jy,jz;
329 __m128d dx,dy,dz,t1,t2,t3,t4;
331 __m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
332 __m128d uij,lij2,uij2,lij3,uij3,diff2;
333 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
334 __m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
338 __m128d obc_mask1,obc_mask2,obc_mask3;
340 __m128d oneeighth = _mm_set1_pd(0.125);
341 __m128d onefourth = _mm_set1_pd(0.25);
343 const __m128d half = _mm_set1_pd(0.5);
344 const __m128d three = _mm_set1_pd(3.0);
345 const __m128d one = _mm_set1_pd(1.0);
346 const __m128d two = _mm_set1_pd(2.0);
347 const __m128d zero = _mm_set1_pd(0.0);
348 const __m128d neg = _mm_set1_pd(-1.0);
350 /* Set the dielectric offset */
351 doffset = born->gb_doffset;
352 gb_radius = born->gb_radius;
353 obc_param = born->param;
354 work = born->gpol_hct_work;
357 shiftvec = fr->shift_vec[0];
359 jx = _mm_setzero_pd();
360 jy = _mm_setzero_pd();
361 jz = _mm_setzero_pd();
365 for(i=0;i<born->nr;i++)
370 for(i=0;i<nl->nri;i++)
374 is3 = 3*nl->shift[i];
376 shY = shiftvec[is3+1];
377 shZ = shiftvec[is3+2];
379 nj1 = nl->jindex[i+1];
381 ix = _mm_set1_pd(shX+x[ii3+0]);
382 iy = _mm_set1_pd(shY+x[ii3+1]);
383 iz = _mm_set1_pd(shZ+x[ii3+2]);
385 rai = _mm_load1_pd(gb_radius+ii);
386 rai_inv= gmx_mm_inv_pd(rai);
388 sum_ai = _mm_setzero_pd();
390 sk_ai = _mm_load1_pd(born->param+ii);
391 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
393 for(k=nj0;k<nj1-1;k+=2)
401 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
402 GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
403 GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA,obc_param+jnrB,sk_aj);
405 dx = _mm_sub_pd(ix, jx);
406 dy = _mm_sub_pd(iy, jy);
407 dz = _mm_sub_pd(iz, jz);
409 rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
411 rinv = gmx_mm_invsqrt_pd(rsq);
412 r = _mm_mul_pd(rsq,rinv);
414 /* Compute raj_inv aj1-4 */
415 raj_inv = gmx_mm_inv_pd(raj);
417 /* Evaluate influence of atom aj -> ai */
418 t1 = _mm_add_pd(r,sk_aj);
419 t2 = _mm_sub_pd(r,sk_aj);
420 t3 = _mm_sub_pd(sk_aj,r);
421 obc_mask1 = _mm_cmplt_pd(rai, t1);
422 obc_mask2 = _mm_cmplt_pd(rai, t2);
423 obc_mask3 = _mm_cmplt_pd(rai, t3);
425 uij = gmx_mm_inv_pd(t1);
426 lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
427 _mm_andnot_pd(obc_mask2,rai_inv));
428 dlij = _mm_and_pd(one,obc_mask2);
429 uij2 = _mm_mul_pd(uij, uij);
430 uij3 = _mm_mul_pd(uij2,uij);
431 lij2 = _mm_mul_pd(lij, lij);
432 lij3 = _mm_mul_pd(lij2,lij);
434 diff2 = _mm_sub_pd(uij2,lij2);
435 lij_inv = gmx_mm_invsqrt_pd(lij2);
436 sk2_aj = _mm_mul_pd(sk_aj,sk_aj);
437 sk2_rinv = _mm_mul_pd(sk2_aj,rinv);
438 prod = _mm_mul_pd(onefourth,sk2_rinv);
440 logterm = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
442 t1 = _mm_sub_pd(lij,uij);
443 t2 = _mm_mul_pd(diff2,
444 _mm_sub_pd(_mm_mul_pd(onefourth,r),
446 t3 = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
447 t1 = _mm_add_pd(t1,_mm_add_pd(t2,t3));
448 t4 = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij));
449 t4 = _mm_and_pd(t4,obc_mask3);
450 t1 = _mm_mul_pd(half,_mm_add_pd(t1,t4));
452 sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) );
454 t1 = _mm_add_pd(_mm_mul_pd(half,lij2),
455 _mm_mul_pd(prod,lij3));
457 _mm_mul_pd(onefourth,
458 _mm_add_pd(_mm_mul_pd(lij,rinv),
459 _mm_mul_pd(lij3,r))));
460 t2 = _mm_mul_pd(onefourth,
461 _mm_add_pd(_mm_mul_pd(uij,rinv),
462 _mm_mul_pd(uij3,r)));
464 _mm_add_pd(_mm_mul_pd(half,uij2),
465 _mm_mul_pd(prod,uij3)));
466 t3 = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
467 _mm_mul_pd(rinv,rinv));
469 _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
471 _mm_mul_pd(sk2_rinv,rinv))));
472 t1 = _mm_mul_pd(rinv,
473 _mm_add_pd(_mm_mul_pd(dlij,t1),
476 dadx1 = _mm_and_pd(t1,obc_mask1);
478 /* Evaluate influence of atom ai -> aj */
479 t1 = _mm_add_pd(r,sk_ai);
480 t2 = _mm_sub_pd(r,sk_ai);
481 t3 = _mm_sub_pd(sk_ai,r);
482 obc_mask1 = _mm_cmplt_pd(raj, t1);
483 obc_mask2 = _mm_cmplt_pd(raj, t2);
484 obc_mask3 = _mm_cmplt_pd(raj, t3);
486 uij = gmx_mm_inv_pd(t1);
487 lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
488 _mm_andnot_pd(obc_mask2,raj_inv));
489 dlij = _mm_and_pd(one,obc_mask2);
490 uij2 = _mm_mul_pd(uij, uij);
491 uij3 = _mm_mul_pd(uij2,uij);
492 lij2 = _mm_mul_pd(lij, lij);
493 lij3 = _mm_mul_pd(lij2,lij);
495 diff2 = _mm_sub_pd(uij2,lij2);
496 lij_inv = gmx_mm_invsqrt_pd(lij2);
497 sk2_rinv = _mm_mul_pd(sk2_ai,rinv);
498 prod = _mm_mul_pd(onefourth,sk2_rinv);
500 logterm = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
502 t1 = _mm_sub_pd(lij,uij);
503 t2 = _mm_mul_pd(diff2,
504 _mm_sub_pd(_mm_mul_pd(onefourth,r),
506 t3 = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
507 t1 = _mm_add_pd(t1,_mm_add_pd(t2,t3));
508 t4 = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij));
509 t4 = _mm_and_pd(t4,obc_mask3);
510 t1 = _mm_mul_pd(half,_mm_add_pd(t1,t4));
512 GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_and_pd(t1,obc_mask1));
514 t1 = _mm_add_pd(_mm_mul_pd(half,lij2),
515 _mm_mul_pd(prod,lij3));
517 _mm_mul_pd(onefourth,
518 _mm_add_pd(_mm_mul_pd(lij,rinv),
519 _mm_mul_pd(lij3,r))));
520 t2 = _mm_mul_pd(onefourth,
521 _mm_add_pd(_mm_mul_pd(uij,rinv),
522 _mm_mul_pd(uij3,r)));
524 _mm_add_pd(_mm_mul_pd(half,uij2),
525 _mm_mul_pd(prod,uij3)));
526 t3 = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
527 _mm_mul_pd(rinv,rinv));
529 _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
531 _mm_mul_pd(sk2_rinv,rinv))));
532 t1 = _mm_mul_pd(rinv,
533 _mm_add_pd(_mm_mul_pd(dlij,t1),
536 dadx2 = _mm_and_pd(t1,obc_mask1);
538 _mm_store_pd(dadx,dadx1);
540 _mm_store_pd(dadx,dadx2);
542 } /* end normal inner loop */
550 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
551 GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
552 GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA,sk_aj);
554 dx = _mm_sub_sd(ix, jx);
555 dy = _mm_sub_sd(iy, jy);
556 dz = _mm_sub_sd(iz, jz);
558 rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
560 rinv = gmx_mm_invsqrt_pd(rsq);
561 r = _mm_mul_sd(rsq,rinv);
563 /* Compute raj_inv aj1-4 */
564 raj_inv = gmx_mm_inv_pd(raj);
566 /* Evaluate influence of atom aj -> ai */
567 t1 = _mm_add_sd(r,sk_aj);
568 t2 = _mm_sub_sd(r,sk_aj);
569 t3 = _mm_sub_sd(sk_aj,r);
570 obc_mask1 = _mm_cmplt_sd(rai, t1);
571 obc_mask2 = _mm_cmplt_sd(rai, t2);
572 obc_mask3 = _mm_cmplt_sd(rai, t3);
574 uij = gmx_mm_inv_pd(t1);
575 lij = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
576 _mm_andnot_pd(obc_mask2,rai_inv));
577 dlij = _mm_and_pd(one,obc_mask2);
578 uij2 = _mm_mul_sd(uij, uij);
579 uij3 = _mm_mul_sd(uij2,uij);
580 lij2 = _mm_mul_sd(lij, lij);
581 lij3 = _mm_mul_sd(lij2,lij);
583 diff2 = _mm_sub_sd(uij2,lij2);
584 lij_inv = gmx_mm_invsqrt_pd(lij2);
585 sk2_aj = _mm_mul_sd(sk_aj,sk_aj);
586 sk2_rinv = _mm_mul_sd(sk2_aj,rinv);
587 prod = _mm_mul_sd(onefourth,sk2_rinv);
589 logterm = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
591 t1 = _mm_sub_sd(lij,uij);
592 t2 = _mm_mul_sd(diff2,
593 _mm_sub_sd(_mm_mul_pd(onefourth,r),
595 t3 = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
596 t1 = _mm_add_sd(t1,_mm_add_sd(t2,t3));
597 t4 = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij));
598 t4 = _mm_and_pd(t4,obc_mask3);
599 t1 = _mm_mul_sd(half,_mm_add_sd(t1,t4));
601 sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) );
603 t1 = _mm_add_sd(_mm_mul_sd(half,lij2),
604 _mm_mul_sd(prod,lij3));
606 _mm_mul_sd(onefourth,
607 _mm_add_sd(_mm_mul_sd(lij,rinv),
608 _mm_mul_sd(lij3,r))));
609 t2 = _mm_mul_sd(onefourth,
610 _mm_add_sd(_mm_mul_sd(uij,rinv),
611 _mm_mul_sd(uij3,r)));
613 _mm_add_sd(_mm_mul_sd(half,uij2),
614 _mm_mul_sd(prod,uij3)));
615 t3 = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
616 _mm_mul_sd(rinv,rinv));
618 _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
620 _mm_mul_sd(sk2_rinv,rinv))));
621 t1 = _mm_mul_sd(rinv,
622 _mm_add_sd(_mm_mul_sd(dlij,t1),
625 dadx1 = _mm_and_pd(t1,obc_mask1);
627 /* Evaluate influence of atom ai -> aj */
628 t1 = _mm_add_sd(r,sk_ai);
629 t2 = _mm_sub_sd(r,sk_ai);
630 t3 = _mm_sub_sd(sk_ai,r);
631 obc_mask1 = _mm_cmplt_sd(raj, t1);
632 obc_mask2 = _mm_cmplt_sd(raj, t2);
633 obc_mask3 = _mm_cmplt_sd(raj, t3);
635 uij = gmx_mm_inv_pd(t1);
636 lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
637 _mm_andnot_pd(obc_mask2,raj_inv));
638 dlij = _mm_and_pd(one,obc_mask2);
639 uij2 = _mm_mul_sd(uij, uij);
640 uij3 = _mm_mul_sd(uij2,uij);
641 lij2 = _mm_mul_sd(lij, lij);
642 lij3 = _mm_mul_sd(lij2,lij);
644 diff2 = _mm_sub_sd(uij2,lij2);
645 lij_inv = gmx_mm_invsqrt_pd(lij2);
646 sk2_rinv = _mm_mul_sd(sk2_ai,rinv);
647 prod = _mm_mul_sd(onefourth,sk2_rinv);
649 logterm = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
651 t1 = _mm_sub_sd(lij,uij);
652 t2 = _mm_mul_sd(diff2,
653 _mm_sub_sd(_mm_mul_sd(onefourth,r),
655 t3 = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
656 t1 = _mm_add_sd(t1,_mm_add_sd(t2,t3));
657 t4 = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij));
658 t4 = _mm_and_pd(t4,obc_mask3);
659 t1 = _mm_mul_sd(half,_mm_add_sd(t1,t4));
661 GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_and_pd(t1,obc_mask1));
663 t1 = _mm_add_sd(_mm_mul_sd(half,lij2),
664 _mm_mul_sd(prod,lij3));
666 _mm_mul_sd(onefourth,
667 _mm_add_sd(_mm_mul_sd(lij,rinv),
668 _mm_mul_sd(lij3,r))));
669 t2 = _mm_mul_sd(onefourth,
670 _mm_add_sd(_mm_mul_sd(uij,rinv),
671 _mm_mul_sd(uij3,r)));
673 _mm_add_sd(_mm_mul_sd(half,uij2),
674 _mm_mul_sd(prod,uij3)));
675 t3 = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
676 _mm_mul_sd(rinv,rinv));
678 _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
680 _mm_mul_sd(sk2_rinv,rinv))));
681 t1 = _mm_mul_sd(rinv,
682 _mm_add_sd(_mm_mul_sd(dlij,t1),
685 dadx2 = _mm_and_pd(t1,obc_mask1);
687 _mm_store_pd(dadx,dadx1);
689 _mm_store_pd(dadx,dadx2);
692 gmx_mm_update_1pot_pd(sum_ai,work+ii);
696 /* Parallel summations */
699 gmx_sum(natoms, work, cr);
701 else if(DOMAINDECOMP(cr))
703 dd_atom_sum_real(cr->dd, work);
706 if(gb_algorithm==egbHCT)
709 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
711 if(born->use[i] != 0)
713 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
714 sum = 1.0/rr - work[i];
715 min_rad = rr + doffset;
718 born->bRad[i] = rad > min_rad ? rad : min_rad;
719 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
723 /* Extra communication required for DD */
726 dd_atom_spread_real(cr->dd, born->bRad);
727 dd_atom_spread_real(cr->dd, fr->invsqrta);
733 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
735 if(born->use[i] != 0)
737 rr = top->atomtypes.gb_radius[md->typeA[i]];
745 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
746 born->bRad[i] = rr_inv - tsum*rr_inv2;
747 born->bRad[i] = 1.0 / born->bRad[i];
749 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
751 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
752 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
755 /* Extra (local) communication required for DD */
758 dd_atom_spread_real(cr->dd, born->bRad);
759 dd_atom_spread_real(cr->dd, fr->invsqrta);
760 dd_atom_spread_real(cr->dd, born->drobc);
771 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
772 double *x, double *f, double *fshift, double *shiftvec,
773 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
775 int i,k,n,ii,jnr,ii3,is3,nj0,nj1,n0,n1;
780 double rbi,shX,shY,shZ;
789 __m128d rbai,rbaj,f_gb, f_gb_ai;
790 __m128d xmm1,xmm2,xmm3;
792 const __m128d two = _mm_set1_pd(2.0);
798 /* Loop to get the proper form for the Born radius term, sse style */
802 if(gb_algorithm==egbSTILL)
807 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
810 else if(gb_algorithm==egbHCT)
815 rb[i] = rbi * rbi * dvda[i];
818 else if(gb_algorithm==egbOBC)
823 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
827 jz = _mm_setzero_pd();
831 for(i=0;i<nl->nri;i++)
835 is3 = 3*nl->shift[i];
837 shY = shiftvec[is3+1];
838 shZ = shiftvec[is3+2];
840 nj1 = nl->jindex[i+1];
842 ix = _mm_set1_pd(shX+x[ii3+0]);
843 iy = _mm_set1_pd(shY+x[ii3+1]);
844 iz = _mm_set1_pd(shZ+x[ii3+2]);
846 rbai = _mm_load1_pd(rb+ii);
847 fix = _mm_setzero_pd();
848 fiy = _mm_setzero_pd();
849 fiz = _mm_setzero_pd();
852 for(k=nj0;k<nj1-1;k+=2)
860 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
862 dx = _mm_sub_pd(ix,jx);
863 dy = _mm_sub_pd(iy,jy);
864 dz = _mm_sub_pd(iz,jz);
866 GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);
868 /* load chain rule terms for j1-4 */
869 f_gb = _mm_load_pd(dadx);
871 f_gb_ai = _mm_load_pd(dadx);
874 /* calculate scalar force */
875 f_gb = _mm_mul_pd(f_gb,rbai);
876 f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
877 f_gb = _mm_add_pd(f_gb,f_gb_ai);
879 tx = _mm_mul_pd(f_gb,dx);
880 ty = _mm_mul_pd(f_gb,dy);
881 tz = _mm_mul_pd(f_gb,dz);
883 fix = _mm_add_pd(fix,tx);
884 fiy = _mm_add_pd(fiy,ty);
885 fiz = _mm_add_pd(fiz,tz);
887 GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
890 /*deal with odd elements */
896 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
898 dx = _mm_sub_sd(ix,jx);
899 dy = _mm_sub_sd(iy,jy);
900 dz = _mm_sub_sd(iz,jz);
902 GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);
904 /* load chain rule terms */
905 f_gb = _mm_load_pd(dadx);
907 f_gb_ai = _mm_load_pd(dadx);
910 /* calculate scalar force */
911 f_gb = _mm_mul_sd(f_gb,rbai);
912 f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
913 f_gb = _mm_add_sd(f_gb,f_gb_ai);
915 tx = _mm_mul_sd(f_gb,dx);
916 ty = _mm_mul_sd(f_gb,dy);
917 tz = _mm_mul_sd(f_gb,dz);
919 fix = _mm_add_sd(fix,tx);
920 fiy = _mm_add_sd(fiy,ty);
921 fiz = _mm_add_sd(fiz,tz);
923 GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
926 /* fix/fiy/fiz now contain four partial force terms, that all should be
927 * added to the i particle forces and shift forces.
929 gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,f+ii3,fshift+is3);
936 /* keep compiler happy */
937 int genborn_sse2_dummy;
939 #endif /* SSE2 intrinsics available */