1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
10 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12 * Copyright (c) 2001-2008, The GROMACS development team,
13 * check out http://www.gromacs.org for more information.
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version 2
18 * of the License, or (at your option) any later version.
20 * If you want to redistribute modifications, please consider that
21 * scientific software is very special. Version control is crucial -
22 * bugs must be traceable. We will be happy to consider code for
23 * inclusion in the official distribution, but derived work must not
24 * be called official GROMACS. Details are found in the README & COPYING
25 * files - if they are missing, get the official version at www.gromacs.org.
27 * To help us fund GROMACS development, we humbly ask that you cite
28 * the papers on the package - you can find them in the top README file.
30 * For more info, check our website at http://www.gromacs.org
33 * Groningen Machine for Chemical Simulation
53 #include "gmx_fatal.h"
54 #include "mtop_util.h"
64 /* Only compile this file if SSE2 intrinsics are available */
65 #if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) )
66 #include <gmx_sse2_double.h>
67 #include <xmmintrin.h>
68 #include <emmintrin.h>
70 #include "genborn_sse2_double.h"
/*
 * calc_gb_rad_still_sse2_double() - SSE2 double-precision kernel that
 * computes Generalized Born radii with the Still model.
 *
 * For each i-atom in the neighborlist it accumulates the polarization
 * contribution gpi over all j-neighbors (two at a time in an __m128d),
 * stores per-pair derivative terms into the dadx array for the later
 * chain-rule force pass, reduces the result into born->gpol_still_work,
 * and finally converts the summed polarization energies into Born radii
 * (born->bRad) and their inverse square roots (fr->invsqrta).
 *
 * NOTE(review): this excerpt is elided - the embedded original line
 * numbers are non-contiguous, so statements such as loop braces, index
 * setup (jnrA/jnrB/j3A/j3B), `dadx` pointer increments between the two
 * stores, and the PAR/DOMAINDECOMP conditionals are missing from view.
 * Comments below describe only what the visible lines establish.
 */
73 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
74 int natoms, gmx_localtop_t *top,
75 const t_atomtypes *atype, double *x, t_nblist *nl,
78 int i,k,n,ii,is3,ii3,nj0,nj1,offset;
79 int jnrA,jnrB,j3A,j3B;
96 __m128d rsq,rinv,rinv2,rinv4,rinv6;
97 __m128d ratio,gpi,rai,raj,vai,vaj,rvdw;
98 __m128d ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
99 __m128d mask,icf4,icf6,mask_cmp;
/* Broadcast scalar constants into both lanes of SSE2 registers */
101 const __m128d half = _mm_set1_pd(0.5);
102 const __m128d three = _mm_set1_pd(3.0);
103 const __m128d one = _mm_set1_pd(1.0);
104 const __m128d two = _mm_set1_pd(2.0);
105 const __m128d zero = _mm_set1_pd(0.0);
106 const __m128d four = _mm_set1_pd(4.0);
/* Still-model parameters (defined elsewhere in GROMACS headers) */
108 const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
109 const __m128d still_pip5 = _mm_set1_pd(STILL_PIP5);
110 const __m128d still_p4 = _mm_set1_pd(STILL_P4);
112 factor = 0.5 * ONE_4PI_EPS0;
114 gb_radius = born->gb_radius;
116 work = born->gpol_still_work;
118 shiftvec = fr->shift_vec[0];
/* Zero j-coordinate registers so the odd-element epilogue starts clean */
122 jx = _mm_setzero_pd();
123 jy = _mm_setzero_pd();
124 jz = _mm_setzero_pd();
/* Clear the per-atom work accumulator (loop body elided in this view) */
128 for(i=0;i<natoms;i++)
/* Outer loop over neighborlist i-entries */
133 for(i=0;i<nl->nri;i++)
137 is3 = 3*nl->shift[i];
/* Periodic shift vector for this list entry (shX load is elided) */
139 shY = shiftvec[is3+1];
140 shZ = shiftvec[is3+2];
142 nj1 = nl->jindex[i+1];
/* Shifted i-atom coordinates broadcast to both SIMD lanes */
144 ix = _mm_set1_pd(shX+x[ii3+0]);
145 iy = _mm_set1_pd(shY+x[ii3+1]);
146 iz = _mm_set1_pd(shZ+x[ii3+2]);
149 /* Polarization energy for atom ai */
150 gpi = _mm_setzero_pd();
152 rai = _mm_load1_pd(gb_radius+ii);
153 prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
/* Main inner loop: two j-atoms per iteration (SSE2 double = 2 lanes) */
155 for(k=nj0;k<nj1-1;k+=2)
163 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
165 GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
166 GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA,vsolv+jnrB,vaj);
/* Pair distance vector and inverse powers of r */
168 dx = _mm_sub_pd(ix,jx);
169 dy = _mm_sub_pd(iy,jy);
170 dz = _mm_sub_pd(iz,jz);
172 rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
173 rinv = gmx_mm_invsqrt_pd(rsq);
174 rinv2 = _mm_mul_pd(rinv,rinv);
175 rinv4 = _mm_mul_pd(rinv2,rinv2);
176 rinv6 = _mm_mul_pd(rinv4,rinv2);
/* ratio = r^2 / (rai+raj)^2, used to select the close-contact correction */
178 rvdw = _mm_add_pd(rai,raj);
179 ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
181 mask_cmp = _mm_cmple_pd(ratio,still_p5inv);
183 /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
184 if( 0 == _mm_movemask_pd(mask_cmp) )
186 /* if ratio>still_p5inv for ALL elements */
/* Far apart: ccf branch elided here; derivative correction is zero */
188 dccf = _mm_setzero_pd();
/* Close-contact branch: cosine-switch correction factor ccf and its
 * derivative dccf (else-branch braces elided in this view) */
192 ratio = _mm_min_pd(ratio,still_p5inv);
193 theta = _mm_mul_pd(ratio,still_pip5);
194 gmx_mm_sincos_pd(theta,&sinq,&cosq);
195 term = _mm_mul_pd(half,_mm_sub_pd(one,cosq));
196 ccf = _mm_mul_pd(term,term);
197 dccf = _mm_mul_pd(_mm_mul_pd(two,term),
198 _mm_mul_pd(sinq,theta));
/* icf4/icf6: r^-4 and r^-6 interaction terms scaled by ccf/dccf */
201 prod = _mm_mul_pd(still_p4,vaj);
202 icf4 = _mm_mul_pd(ccf,rinv4);
203 icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four,ccf),dccf), rinv6);
/* Scatter ai->aj contribution into the j work array entries */
205 GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_mul_pd(prod_ai,icf4));
207 gpi = _mm_add_pd(gpi, _mm_mul_pd(prod,icf4) );
/* Store derivative terms for the chain-rule pass.
 * NOTE(review): the `dadx` pointer increment between these two stores
 * is elided in this excerpt - in the full file dadx advances so the
 * two stores do not overwrite each other; verify against upstream. */
209 _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
211 _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
/* Epilogue: same math for a single leftover j-atom, using _sd (scalar
 * double) operations on lane 0 only */
221 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
223 GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
224 GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA,vaj);
226 dx = _mm_sub_sd(ix,jx);
227 dy = _mm_sub_sd(iy,jy);
228 dz = _mm_sub_sd(iz,jz);
230 rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
231 rinv = gmx_mm_invsqrt_pd(rsq);
232 rinv2 = _mm_mul_sd(rinv,rinv);
233 rinv4 = _mm_mul_sd(rinv2,rinv2);
234 rinv6 = _mm_mul_sd(rinv4,rinv2);
236 rvdw = _mm_add_sd(rai,raj);
237 ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
239 mask_cmp = _mm_cmple_sd(ratio,still_p5inv);
241 /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
242 if( 0 == _mm_movemask_pd(mask_cmp) )
244 /* if ratio>still_p5inv for ALL elements */
246 dccf = _mm_setzero_pd();
250 ratio = _mm_min_sd(ratio,still_p5inv);
251 theta = _mm_mul_sd(ratio,still_pip5);
252 gmx_mm_sincos_pd(theta,&sinq,&cosq);
253 term = _mm_mul_sd(half,_mm_sub_sd(one,cosq));
254 ccf = _mm_mul_sd(term,term);
255 dccf = _mm_mul_sd(_mm_mul_sd(two,term),
256 _mm_mul_sd(sinq,theta));
259 prod = _mm_mul_sd(still_p4,vaj);
260 icf4 = _mm_mul_sd(ccf,rinv4);
261 icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four,ccf),dccf), rinv6);
263 GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_mul_sd(prod_ai,icf4));
265 gpi = _mm_add_sd(gpi, _mm_mul_sd(prod,icf4) );
/* NOTE(review): dadx increments between these stores are elided here too */
267 _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
269 _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
/* Horizontal-add the 2-lane gpi accumulator into work[ii] */
272 gmx_mm_update_1pot_pd(gpi,work+ii);
275 /* Sum up the polarization energy from other nodes */
278 gmx_sum(natoms, work, cr);
280 else if(DOMAINDECOMP(cr))
282 dd_atom_sum_real(cr->dd, work);
285 /* Compute the radii */
286 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
288 if(born->use[i] != 0)
290 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
291 gpi2 = gpi_ai * gpi_ai;
/* bRad = factor / |gpi_ai|; invsqrta cached for the force kernels */
292 born->bRad[i] = factor*gmx_invsqrt(gpi2);
293 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
297 /* Extra (local) communication required for DD */
300 dd_atom_spread_real(cr->dd, born->bRad);
301 dd_atom_spread_real(cr->dd, fr->invsqrta);
/*
 * calc_gb_rad_hct_obc_sse2_double() - SSE2 double-precision kernel that
 * computes Generalized Born radii with either the HCT or the OBC model,
 * selected at the end by gb_algorithm (egbHCT vs. the OBC branch).
 *
 * The pairwise part is shared between the two models: for every i/j pair
 * it evaluates the HCT integral bounds lij/uij, the log term, and both
 * directions of influence (aj->ai accumulated into sum_ai, ai->aj
 * scattered into the work array), storing derivative terms (dadx1/dadx2)
 * into dadx for the later chain-rule force pass.  The final per-atom loop
 * turns the accumulated sums into Born radii and, for OBC, also the
 * drobc chain-rule factors.
 *
 * NOTE(review): this excerpt is elided - embedded original line numbers
 * are non-contiguous, so loop braces, jnrA/jnrB/j3A/j3B setup, `dadx`
 * increments, several _mm_sub_pd/_mm_add_pd continuation lines, and the
 * PAR/DOMAINDECOMP conditionals are missing from view.  Comments below
 * describe only what the visible lines establish.
 */
309 calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
310 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
312 int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
316 double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
317 double sum_ai2, sum_ai3,tsum,tchain,doffset;
326 __m128d ix,iy,iz,jx,jy,jz;
327 __m128d dx,dy,dz,t1,t2,t3,t4;
329 __m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
330 __m128d uij,lij2,uij2,lij3,uij3,diff2;
331 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
332 __m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
336 __m128d obc_mask1,obc_mask2,obc_mask3;
/* Broadcast numeric constants used by the HCT integral expressions */
338 __m128d oneeighth = _mm_set1_pd(0.125);
339 __m128d onefourth = _mm_set1_pd(0.25);
341 const __m128d half = _mm_set1_pd(0.5);
342 const __m128d three = _mm_set1_pd(3.0);
343 const __m128d one = _mm_set1_pd(1.0);
344 const __m128d two = _mm_set1_pd(2.0);
345 const __m128d zero = _mm_set1_pd(0.0);
346 const __m128d neg = _mm_set1_pd(-1.0);
348 /* Set the dielectric offset */
349 doffset = born->gb_doffset;
350 gb_radius = born->gb_radius;
351 obc_param = born->param;
352 work = born->gpol_hct_work;
355 shiftvec = fr->shift_vec[0];
/* Zero j-coordinate registers so the odd-element epilogue starts clean */
357 jx = _mm_setzero_pd();
358 jy = _mm_setzero_pd();
359 jz = _mm_setzero_pd();
/* Clear the per-atom work accumulator (loop body elided in this view) */
363 for(i=0;i<born->nr;i++)
/* Outer loop over neighborlist i-entries */
368 for(i=0;i<nl->nri;i++)
372 is3 = 3*nl->shift[i];
374 shY = shiftvec[is3+1];
375 shZ = shiftvec[is3+2];
377 nj1 = nl->jindex[i+1];
/* Shifted i-atom coordinates broadcast to both SIMD lanes */
379 ix = _mm_set1_pd(shX+x[ii3+0]);
380 iy = _mm_set1_pd(shY+x[ii3+1]);
381 iz = _mm_set1_pd(shZ+x[ii3+2]);
/* rai = offset i radius, sk_ai = its HCT/OBC scaling parameter */
383 rai = _mm_load1_pd(gb_radius+ii);
384 rai_inv= gmx_mm_inv_pd(rai);
386 sum_ai = _mm_setzero_pd();
388 sk_ai = _mm_load1_pd(born->param+ii);
389 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
/* Main inner loop: two j-atoms per iteration */
391 for(k=nj0;k<nj1-1;k+=2)
399 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
400 GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
401 GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA,obc_param+jnrB,sk_aj);
403 dx = _mm_sub_pd(ix, jx);
404 dy = _mm_sub_pd(iy, jy);
405 dz = _mm_sub_pd(iz, jz);
407 rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
409 rinv = gmx_mm_invsqrt_pd(rsq);
410 r = _mm_mul_pd(rsq,rinv);
412 /* Compute raj_inv aj1-4 */
413 raj_inv = gmx_mm_inv_pd(raj);
415 /* Evaluate influence of atom aj -> ai */
/* obc_mask1: pair interacts at all; obc_mask2: i outside j's descreening
 * sphere (selects 1/t2 vs rai_inv for lij); obc_mask3: i fully inside */
416 t1 = _mm_add_pd(r,sk_aj);
417 t2 = _mm_sub_pd(r,sk_aj);
418 t3 = _mm_sub_pd(sk_aj,r);
419 obc_mask1 = _mm_cmplt_pd(rai, t1);
420 obc_mask2 = _mm_cmplt_pd(rai, t2);
421 obc_mask3 = _mm_cmplt_pd(rai, t3);
/* HCT integral bounds: uij = 1/(r+sk), lij = 1/(r-sk) or 1/rai (masked) */
423 uij = gmx_mm_inv_pd(t1);
424 lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
425 _mm_andnot_pd(obc_mask2,rai_inv));
426 dlij = _mm_and_pd(one,obc_mask2);
427 uij2 = _mm_mul_pd(uij, uij);
428 uij3 = _mm_mul_pd(uij2,uij);
429 lij2 = _mm_mul_pd(lij, lij);
430 lij3 = _mm_mul_pd(lij2,lij);
432 diff2 = _mm_sub_pd(uij2,lij2);
433 lij_inv = gmx_mm_invsqrt_pd(lij2);
434 sk2_aj = _mm_mul_pd(sk_aj,sk_aj);
435 sk2_rinv = _mm_mul_pd(sk2_aj,rinv);
436 prod = _mm_mul_pd(onefourth,sk2_rinv);
438 logterm = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
/* Assemble the HCT descreening integral; a continuation operand of the
 * t2 expression is elided in this excerpt */
440 t1 = _mm_sub_pd(lij,uij);
441 t2 = _mm_mul_pd(diff2,
442 _mm_sub_pd(_mm_mul_pd(onefourth,r),
444 t3 = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
445 t1 = _mm_add_pd(t1,_mm_add_pd(t2,t3));
/* Correction term applied only when atom i is inside j's sphere */
446 t4 = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij));
447 t4 = _mm_and_pd(t4,obc_mask3);
448 t1 = _mm_mul_pd(half,_mm_add_pd(t1,t4));
450 sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) );
/* Derivative of the integral w.r.t. r (several continuation lines elided) */
452 t1 = _mm_add_pd(_mm_mul_pd(half,lij2),
453 _mm_mul_pd(prod,lij3));
455 _mm_mul_pd(onefourth,
456 _mm_add_pd(_mm_mul_pd(lij,rinv),
457 _mm_mul_pd(lij3,r))));
458 t2 = _mm_mul_pd(onefourth,
459 _mm_add_pd(_mm_mul_pd(uij,rinv),
460 _mm_mul_pd(uij3,r)));
462 _mm_add_pd(_mm_mul_pd(half,uij2),
463 _mm_mul_pd(prod,uij3)));
464 t3 = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
465 _mm_mul_pd(rinv,rinv));
467 _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
469 _mm_mul_pd(sk2_rinv,rinv))));
470 t1 = _mm_mul_pd(rinv,
471 _mm_add_pd(_mm_mul_pd(dlij,t1),
474 dadx1 = _mm_and_pd(t1,obc_mask1);
476 /* Evaluate influence of atom ai -> aj */
/* Mirror of the block above with i and j roles exchanged */
477 t1 = _mm_add_pd(r,sk_ai);
478 t2 = _mm_sub_pd(r,sk_ai);
479 t3 = _mm_sub_pd(sk_ai,r);
480 obc_mask1 = _mm_cmplt_pd(raj, t1);
481 obc_mask2 = _mm_cmplt_pd(raj, t2);
482 obc_mask3 = _mm_cmplt_pd(raj, t3);
484 uij = gmx_mm_inv_pd(t1);
485 lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
486 _mm_andnot_pd(obc_mask2,raj_inv));
487 dlij = _mm_and_pd(one,obc_mask2);
488 uij2 = _mm_mul_pd(uij, uij);
489 uij3 = _mm_mul_pd(uij2,uij);
490 lij2 = _mm_mul_pd(lij, lij);
491 lij3 = _mm_mul_pd(lij2,lij);
493 diff2 = _mm_sub_pd(uij2,lij2);
494 lij_inv = gmx_mm_invsqrt_pd(lij2);
495 sk2_rinv = _mm_mul_pd(sk2_ai,rinv);
496 prod = _mm_mul_pd(onefourth,sk2_rinv);
498 logterm = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
500 t1 = _mm_sub_pd(lij,uij);
501 t2 = _mm_mul_pd(diff2,
502 _mm_sub_pd(_mm_mul_pd(onefourth,r),
504 t3 = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
505 t1 = _mm_add_pd(t1,_mm_add_pd(t2,t3));
506 t4 = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij));
507 t4 = _mm_and_pd(t4,obc_mask3);
508 t1 = _mm_mul_pd(half,_mm_add_pd(t1,t4));
/* Scatter the ai->aj contribution into the two j work entries */
510 GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_and_pd(t1,obc_mask1));
512 t1 = _mm_add_pd(_mm_mul_pd(half,lij2),
513 _mm_mul_pd(prod,lij3));
515 _mm_mul_pd(onefourth,
516 _mm_add_pd(_mm_mul_pd(lij,rinv),
517 _mm_mul_pd(lij3,r))));
518 t2 = _mm_mul_pd(onefourth,
519 _mm_add_pd(_mm_mul_pd(uij,rinv),
520 _mm_mul_pd(uij3,r)));
522 _mm_add_pd(_mm_mul_pd(half,uij2),
523 _mm_mul_pd(prod,uij3)));
524 t3 = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
525 _mm_mul_pd(rinv,rinv));
527 _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
529 _mm_mul_pd(sk2_rinv,rinv))));
530 t1 = _mm_mul_pd(rinv,
531 _mm_add_pd(_mm_mul_pd(dlij,t1),
534 dadx2 = _mm_and_pd(t1,obc_mask1);
/* Store both derivative terms for the chain-rule pass.
 * NOTE(review): `dadx` pointer increments between these stores are
 * elided in this excerpt; verify against the full upstream file. */
536 _mm_store_pd(dadx,dadx1);
538 _mm_store_pd(dadx,dadx2);
540 } /* end normal inner loop */
/* Epilogue: same math for a single leftover j-atom with _sd operations */
548 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
549 GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
550 GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA,sk_aj);
552 dx = _mm_sub_sd(ix, jx);
553 dy = _mm_sub_sd(iy, jy);
554 dz = _mm_sub_sd(iz, jz);
556 rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);
558 rinv = gmx_mm_invsqrt_pd(rsq);
559 r = _mm_mul_sd(rsq,rinv);
561 /* Compute raj_inv aj1-4 */
562 raj_inv = gmx_mm_inv_pd(raj);
564 /* Evaluate influence of atom aj -> ai */
565 t1 = _mm_add_sd(r,sk_aj);
566 t2 = _mm_sub_sd(r,sk_aj);
567 t3 = _mm_sub_sd(sk_aj,r);
568 obc_mask1 = _mm_cmplt_sd(rai, t1);
569 obc_mask2 = _mm_cmplt_sd(rai, t2);
570 obc_mask3 = _mm_cmplt_sd(rai, t3);
572 uij = gmx_mm_inv_pd(t1);
573 lij = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
574 _mm_andnot_pd(obc_mask2,rai_inv));
575 dlij = _mm_and_pd(one,obc_mask2);
576 uij2 = _mm_mul_sd(uij, uij);
577 uij3 = _mm_mul_sd(uij2,uij);
578 lij2 = _mm_mul_sd(lij, lij);
579 lij3 = _mm_mul_sd(lij2,lij);
581 diff2 = _mm_sub_sd(uij2,lij2);
582 lij_inv = gmx_mm_invsqrt_pd(lij2);
583 sk2_aj = _mm_mul_sd(sk_aj,sk_aj);
584 sk2_rinv = _mm_mul_sd(sk2_aj,rinv);
585 prod = _mm_mul_sd(onefourth,sk2_rinv);
587 logterm = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
589 t1 = _mm_sub_sd(lij,uij);
590 t2 = _mm_mul_sd(diff2,
591 _mm_sub_sd(_mm_mul_pd(onefourth,r),
593 t3 = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
594 t1 = _mm_add_sd(t1,_mm_add_sd(t2,t3));
595 t4 = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij));
596 t4 = _mm_and_pd(t4,obc_mask3);
597 t1 = _mm_mul_sd(half,_mm_add_sd(t1,t4));
599 sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) );
601 t1 = _mm_add_sd(_mm_mul_sd(half,lij2),
602 _mm_mul_sd(prod,lij3));
604 _mm_mul_sd(onefourth,
605 _mm_add_sd(_mm_mul_sd(lij,rinv),
606 _mm_mul_sd(lij3,r))));
607 t2 = _mm_mul_sd(onefourth,
608 _mm_add_sd(_mm_mul_sd(uij,rinv),
609 _mm_mul_sd(uij3,r)));
611 _mm_add_sd(_mm_mul_sd(half,uij2),
612 _mm_mul_sd(prod,uij3)));
613 t3 = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
614 _mm_mul_sd(rinv,rinv));
616 _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
618 _mm_mul_sd(sk2_rinv,rinv))));
619 t1 = _mm_mul_sd(rinv,
620 _mm_add_sd(_mm_mul_sd(dlij,t1),
623 dadx1 = _mm_and_pd(t1,obc_mask1);
625 /* Evaluate influence of atom ai -> aj */
626 t1 = _mm_add_sd(r,sk_ai);
627 t2 = _mm_sub_sd(r,sk_ai);
628 t3 = _mm_sub_sd(sk_ai,r);
629 obc_mask1 = _mm_cmplt_sd(raj, t1);
630 obc_mask2 = _mm_cmplt_sd(raj, t2);
631 obc_mask3 = _mm_cmplt_sd(raj, t3);
633 uij = gmx_mm_inv_pd(t1);
634 lij = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
635 _mm_andnot_pd(obc_mask2,raj_inv));
636 dlij = _mm_and_pd(one,obc_mask2);
637 uij2 = _mm_mul_sd(uij, uij);
638 uij3 = _mm_mul_sd(uij2,uij);
639 lij2 = _mm_mul_sd(lij, lij);
640 lij3 = _mm_mul_sd(lij2,lij);
642 diff2 = _mm_sub_sd(uij2,lij2);
643 lij_inv = gmx_mm_invsqrt_pd(lij2);
644 sk2_rinv = _mm_mul_sd(sk2_ai,rinv);
645 prod = _mm_mul_sd(onefourth,sk2_rinv);
647 logterm = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
649 t1 = _mm_sub_sd(lij,uij);
650 t2 = _mm_mul_sd(diff2,
651 _mm_sub_sd(_mm_mul_sd(onefourth,r),
653 t3 = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
654 t1 = _mm_add_sd(t1,_mm_add_sd(t2,t3));
655 t4 = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij));
656 t4 = _mm_and_pd(t4,obc_mask3);
657 t1 = _mm_mul_sd(half,_mm_add_sd(t1,t4));
659 GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_and_pd(t1,obc_mask1));
661 t1 = _mm_add_sd(_mm_mul_sd(half,lij2),
662 _mm_mul_sd(prod,lij3));
664 _mm_mul_sd(onefourth,
665 _mm_add_sd(_mm_mul_sd(lij,rinv),
666 _mm_mul_sd(lij3,r))));
667 t2 = _mm_mul_sd(onefourth,
668 _mm_add_sd(_mm_mul_sd(uij,rinv),
669 _mm_mul_sd(uij3,r)));
671 _mm_add_sd(_mm_mul_sd(half,uij2),
672 _mm_mul_sd(prod,uij3)));
673 t3 = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
674 _mm_mul_sd(rinv,rinv));
676 _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
678 _mm_mul_sd(sk2_rinv,rinv))));
679 t1 = _mm_mul_sd(rinv,
680 _mm_add_sd(_mm_mul_sd(dlij,t1),
683 dadx2 = _mm_and_pd(t1,obc_mask1);
685 _mm_store_pd(dadx,dadx1);
687 _mm_store_pd(dadx,dadx2);
/* Horizontal-add the 2-lane sum_ai accumulator into work[ii] */
690 gmx_mm_update_1pot_pd(sum_ai,work+ii);
694 /* Parallel summations */
697 gmx_sum(natoms, work, cr);
699 else if(DOMAINDECOMP(cr))
701 dd_atom_sum_real(cr->dd, work);
/* Final per-atom pass: convert accumulated sums into Born radii */
704 if(gb_algorithm==egbHCT)
707 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
709 if(born->use[i] != 0)
/* HCT: 1/bRad = 1/(rr-doffset) - work[i], clamped to min_rad
 * (the `rad = 1.0/sum;` style line is elided in this excerpt) */
711 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
712 sum = 1.0/rr - work[i];
713 min_rad = rr + doffset;
716 born->bRad[i] = rad > min_rad ? rad : min_rad;
717 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
721 /* Extra communication required for DD */
724 dd_atom_spread_real(cr->dd, born->bRad);
725 dd_atom_spread_real(cr->dd, fr->invsqrta);
/* OBC branch: tanh-rescaled radii (Onufriev-Bashford-Case);
 * the sum/sum2/sum3 setup lines are elided in this excerpt */
731 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
733 if(born->use[i] != 0)
735 rr = top->atomtypes.gb_radius[md->typeA[i]];
743 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
744 born->bRad[i] = rr_inv - tsum*rr_inv2;
745 born->bRad[i] = 1.0 / born->bRad[i];
747 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
/* drobc: d(bRad)/d(sum) factor needed by the OBC chain-rule forces */
749 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
750 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
753 /* Extra (local) communication required for DD */
756 dd_atom_spread_real(cr->dd, born->bRad);
757 dd_atom_spread_real(cr->dd, fr->invsqrta);
758 dd_atom_spread_real(cr->dd, born->drobc);
/*
 * calc_gb_chainrule_sse2_double() - SSE2 double-precision kernel that
 * applies the chain rule to the stored dBorn/dr terms (dadx, written by
 * the radius kernels above) and the derivative of the energy w.r.t. the
 * Born radii (dvda), producing Cartesian forces.
 *
 * First a scalar loop converts dvda into the algorithm-specific rb[]
 * factor (Still / HCT / OBC each use a different prefactor), then the
 * SIMD loop accumulates pair forces: j forces are decremented in place,
 * i forces and shift forces are updated via gmx_mm_update_iforce_1atom_pd.
 *
 * NOTE(review): this excerpt is elided - the embedded original line
 * numbers are non-contiguous, so loop braces, the rbi/rb loop bodies,
 * `dadx` pointer increments between loads, and jnrA/j3A setup are
 * missing from view.  Comments describe only what the visible lines show.
 */
769 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
770 double *x, double *f, double *fshift, double *shiftvec,
771 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
773 int i,k,n,ii,jnr,ii3,is3,nj0,nj1,n0,n1;
778 double rbi,shX,shY,shZ;
787 __m128d rbai,rbaj,f_gb, f_gb_ai;
788 __m128d xmm1,xmm2,xmm3;
790 const __m128d two = _mm_set1_pd(2.0);
796 /* Loop to get the proper form for the Born radius term, sse style */
/* Still: rb = 2*rbi^2*dvda/ONE_4PI_EPS0 (rbi assignment elided) */
800 if(gb_algorithm==egbSTILL)
805 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
/* HCT: rb = rbi^2*dvda */
808 else if(gb_algorithm==egbHCT)
813 rb[i] = rbi * rbi * dvda[i];
/* OBC: additionally multiplied by the drobc chain-rule factor */
816 else if(gb_algorithm==egbOBC)
821 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
825 jz = _mm_setzero_pd();
/* Outer loop over neighborlist i-entries */
829 for(i=0;i<nl->nri;i++)
833 is3 = 3*nl->shift[i];
835 shY = shiftvec[is3+1];
836 shZ = shiftvec[is3+2];
838 nj1 = nl->jindex[i+1];
840 ix = _mm_set1_pd(shX+x[ii3+0]);
841 iy = _mm_set1_pd(shY+x[ii3+1]);
842 iz = _mm_set1_pd(shZ+x[ii3+2]);
/* Broadcast rb[ii] and clear the i-force accumulators */
844 rbai = _mm_load1_pd(rb+ii);
845 fix = _mm_setzero_pd();
846 fiy = _mm_setzero_pd();
847 fiz = _mm_setzero_pd();
/* Main inner loop: two j-atoms per iteration */
850 for(k=nj0;k<nj1-1;k+=2)
858 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
860 dx = _mm_sub_pd(ix,jx);
861 dy = _mm_sub_pd(iy,jy);
862 dz = _mm_sub_pd(iz,jz);
864 GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);
866 /* load chain rule terms for j1-4 */
/* NOTE(review): the `dadx += 2` between these loads is elided in this
 * excerpt - in the full file the pointer advances so the two loads read
 * consecutive pairs written by the radius kernel. */
867 f_gb = _mm_load_pd(dadx);
869 f_gb_ai = _mm_load_pd(dadx);
872 /* calculate scalar force */
873 f_gb = _mm_mul_pd(f_gb,rbai);
874 f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
875 f_gb = _mm_add_pd(f_gb,f_gb_ai);
/* Project scalar force onto the distance vector */
877 tx = _mm_mul_pd(f_gb,dx);
878 ty = _mm_mul_pd(f_gb,dy);
879 tz = _mm_mul_pd(f_gb,dz);
881 fix = _mm_add_pd(fix,tx);
882 fiy = _mm_add_pd(fiy,ty);
883 fiz = _mm_add_pd(fiz,tz);
/* Newton's third law: subtract the pair force from the j atoms */
885 GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
888 /*deal with odd elements */
894 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
896 dx = _mm_sub_sd(ix,jx);
897 dy = _mm_sub_sd(iy,jy);
898 dz = _mm_sub_sd(iz,jz);
900 GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);
902 /* load chain rule terms */
903 f_gb = _mm_load_pd(dadx);
905 f_gb_ai = _mm_load_pd(dadx);
908 /* calculate scalar force */
909 f_gb = _mm_mul_sd(f_gb,rbai);
910 f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
911 f_gb = _mm_add_sd(f_gb,f_gb_ai);
913 tx = _mm_mul_sd(f_gb,dx);
914 ty = _mm_mul_sd(f_gb,dy);
915 tz = _mm_mul_sd(f_gb,dz);
917 fix = _mm_add_sd(fix,tx);
918 fiy = _mm_add_sd(fiy,ty);
919 fiz = _mm_add_sd(fiz,tz);
921 GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
924 /* fix/fiy/fiz now contain four partial force terms, that all should be
925 * added to the i particle forces and shift forces.
927 gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,f+ii3,fshift+is3);
934 /* keep compiler happy */
/* NOTE(review): a dummy file-scope symbol; presumably present so the
 * translation unit is never empty when the SSE2+double guard excludes
 * all code (ISO C requires at least one declaration per unit) - the
 * #else branch it likely belongs to is elided in this excerpt. */
935 int genborn_sse2_dummy;
937 #endif /* SSE2 intrinsics available */