2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2013, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
49 #include "gromacs/fileio/pdbio.h"
55 #include "gmx_fatal.h"
56 #include "mtop_util.h"
59 #include "gromacs/utility/gmxmpi.h"
61 /* Only compile this file if SSE2 intrinsics are available */
62 #if 0 && defined (GMX_X86_SSE2)
63 #include <gmx_sse2_double.h>
64 #include <emmintrin.h>
66 #include "genborn_sse2_double.h"
69 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
70 int natoms, gmx_localtop_t *top,
71 double *x, t_nblist *nl,
74 int i, k, n, ii, is3, ii3, nj0, nj1, offset;
75 int jnrA, jnrB, j3A, j3B;
92 __m128d rsq, rinv, rinv2, rinv4, rinv6;
93 __m128d ratio, gpi, rai, raj, vai, vaj, rvdw;
94 __m128d ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
95 __m128d mask, icf4, icf6, mask_cmp;
97 const __m128d half = _mm_set1_pd(0.5);
98 const __m128d three = _mm_set1_pd(3.0);
99 const __m128d one = _mm_set1_pd(1.0);
100 const __m128d two = _mm_set1_pd(2.0);
101 const __m128d zero = _mm_set1_pd(0.0);
102 const __m128d four = _mm_set1_pd(4.0);
104 const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
105 const __m128d still_pip5 = _mm_set1_pd(STILL_PIP5);
106 const __m128d still_p4 = _mm_set1_pd(STILL_P4);
108 factor = 0.5 * ONE_4PI_EPS0;
110 gb_radius = born->gb_radius;
112 work = born->gpol_still_work;
114 shiftvec = fr->shift_vec[0];
118 jx = _mm_setzero_pd();
119 jy = _mm_setzero_pd();
120 jz = _mm_setzero_pd();
124 for (i = 0; i < natoms; i++)
129 for (i = 0; i < nl->nri; i++)
133 is3 = 3*nl->shift[i];
135 shY = shiftvec[is3+1];
136 shZ = shiftvec[is3+2];
138 nj1 = nl->jindex[i+1];
140 ix = _mm_set1_pd(shX+x[ii3+0]);
141 iy = _mm_set1_pd(shY+x[ii3+1]);
142 iz = _mm_set1_pd(shZ+x[ii3+2]);
145 /* Polarization energy for atom ai */
146 gpi = _mm_setzero_pd();
148 rai = _mm_load1_pd(gb_radius+ii);
149 prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
151 for (k = nj0; k < nj1-1; k += 2)
159 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
161 GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
162 GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);
164 dx = _mm_sub_pd(ix, jx);
165 dy = _mm_sub_pd(iy, jy);
166 dz = _mm_sub_pd(iz, jz);
168 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
169 rinv = gmx_mm_invsqrt_pd(rsq);
170 rinv2 = _mm_mul_pd(rinv, rinv);
171 rinv4 = _mm_mul_pd(rinv2, rinv2);
172 rinv6 = _mm_mul_pd(rinv4, rinv2);
174 rvdw = _mm_add_pd(rai, raj);
175 ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
177 mask_cmp = _mm_cmple_pd(ratio, still_p5inv);
179 /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
180 if (0 == _mm_movemask_pd(mask_cmp) )
182 /* if ratio>still_p5inv for ALL elements */
184 dccf = _mm_setzero_pd();
188 ratio = _mm_min_pd(ratio, still_p5inv);
189 theta = _mm_mul_pd(ratio, still_pip5);
190 gmx_mm_sincos_pd(theta, &sinq, &cosq);
191 term = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
192 ccf = _mm_mul_pd(term, term);
193 dccf = _mm_mul_pd(_mm_mul_pd(two, term),
194 _mm_mul_pd(sinq, theta));
197 prod = _mm_mul_pd(still_p4, vaj);
198 icf4 = _mm_mul_pd(ccf, rinv4);
199 icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);
201 GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));
203 gpi = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );
205 _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
207 _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
217 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
219 GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
220 GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);
222 dx = _mm_sub_sd(ix, jx);
223 dy = _mm_sub_sd(iy, jy);
224 dz = _mm_sub_sd(iz, jz);
226 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
227 rinv = gmx_mm_invsqrt_pd(rsq);
228 rinv2 = _mm_mul_sd(rinv, rinv);
229 rinv4 = _mm_mul_sd(rinv2, rinv2);
230 rinv6 = _mm_mul_sd(rinv4, rinv2);
232 rvdw = _mm_add_sd(rai, raj);
233 ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
235 mask_cmp = _mm_cmple_sd(ratio, still_p5inv);
237 /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
238 if (0 == _mm_movemask_pd(mask_cmp) )
240 /* if ratio>still_p5inv for ALL elements */
242 dccf = _mm_setzero_pd();
246 ratio = _mm_min_sd(ratio, still_p5inv);
247 theta = _mm_mul_sd(ratio, still_pip5);
248 gmx_mm_sincos_pd(theta, &sinq, &cosq);
249 term = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
250 ccf = _mm_mul_sd(term, term);
251 dccf = _mm_mul_sd(_mm_mul_sd(two, term),
252 _mm_mul_sd(sinq, theta));
255 prod = _mm_mul_sd(still_p4, vaj);
256 icf4 = _mm_mul_sd(ccf, rinv4);
257 icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);
259 GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));
261 gpi = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );
263 _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
265 _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
268 gmx_mm_update_1pot_pd(gpi, work+ii);
271 /* Sum up the polarization energy from other nodes */
274 gmx_sum(natoms, work, cr);
276 else if (DOMAINDECOMP(cr))
278 dd_atom_sum_real(cr->dd, work);
281 /* Compute the radii */
282 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
284 if (born->use[i] != 0)
286 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
287 gpi2 = gpi_ai * gpi_ai;
288 born->bRad[i] = factor*gmx_invsqrt(gpi2);
289 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
293 /* Extra (local) communication required for DD */
294 if (DOMAINDECOMP(cr))
296 dd_atom_spread_real(cr->dd, born->bRad);
297 dd_atom_spread_real(cr->dd, fr->invsqrta);
305 calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
306 double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
308 int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
311 double shX, shY, shZ;
312 double rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
313 double sum_ai2, sum_ai3, tsum, tchain, doffset;
322 __m128d ix, iy, iz, jx, jy, jz;
323 __m128d dx, dy, dz, t1, t2, t3, t4;
324 __m128d rsq, rinv, r;
325 __m128d rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
326 __m128d uij, lij2, uij2, lij3, uij3, diff2;
327 __m128d lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
328 __m128d sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
329 __m128d dadx1, dadx2;
332 __m128d obc_mask1, obc_mask2, obc_mask3;
334 __m128d oneeighth = _mm_set1_pd(0.125);
335 __m128d onefourth = _mm_set1_pd(0.25);
337 const __m128d half = _mm_set1_pd(0.5);
338 const __m128d three = _mm_set1_pd(3.0);
339 const __m128d one = _mm_set1_pd(1.0);
340 const __m128d two = _mm_set1_pd(2.0);
341 const __m128d zero = _mm_set1_pd(0.0);
342 const __m128d neg = _mm_set1_pd(-1.0);
344 /* Set the dielectric offset */
345 doffset = born->gb_doffset;
346 gb_radius = born->gb_radius;
347 obc_param = born->param;
348 work = born->gpol_hct_work;
351 shiftvec = fr->shift_vec[0];
353 jx = _mm_setzero_pd();
354 jy = _mm_setzero_pd();
355 jz = _mm_setzero_pd();
359 for (i = 0; i < born->nr; i++)
364 for (i = 0; i < nl->nri; i++)
368 is3 = 3*nl->shift[i];
370 shY = shiftvec[is3+1];
371 shZ = shiftvec[is3+2];
373 nj1 = nl->jindex[i+1];
375 ix = _mm_set1_pd(shX+x[ii3+0]);
376 iy = _mm_set1_pd(shY+x[ii3+1]);
377 iz = _mm_set1_pd(shZ+x[ii3+2]);
379 rai = _mm_load1_pd(gb_radius+ii);
380 rai_inv = gmx_mm_inv_pd(rai);
382 sum_ai = _mm_setzero_pd();
384 sk_ai = _mm_load1_pd(born->param+ii);
385 sk2_ai = _mm_mul_pd(sk_ai, sk_ai);
387 for (k = nj0; k < nj1-1; k += 2)
395 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
396 GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
397 GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);
399 dx = _mm_sub_pd(ix, jx);
400 dy = _mm_sub_pd(iy, jy);
401 dz = _mm_sub_pd(iz, jz);
403 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
405 rinv = gmx_mm_invsqrt_pd(rsq);
406 r = _mm_mul_pd(rsq, rinv);
408 /* Compute raj_inv aj1-4 */
409 raj_inv = gmx_mm_inv_pd(raj);
411 /* Evaluate influence of atom aj -> ai */
412 t1 = _mm_add_pd(r, sk_aj);
413 t2 = _mm_sub_pd(r, sk_aj);
414 t3 = _mm_sub_pd(sk_aj, r);
415 obc_mask1 = _mm_cmplt_pd(rai, t1);
416 obc_mask2 = _mm_cmplt_pd(rai, t2);
417 obc_mask3 = _mm_cmplt_pd(rai, t3);
419 uij = gmx_mm_inv_pd(t1);
420 lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
421 _mm_andnot_pd(obc_mask2, rai_inv));
422 dlij = _mm_and_pd(one, obc_mask2);
423 uij2 = _mm_mul_pd(uij, uij);
424 uij3 = _mm_mul_pd(uij2, uij);
425 lij2 = _mm_mul_pd(lij, lij);
426 lij3 = _mm_mul_pd(lij2, lij);
428 diff2 = _mm_sub_pd(uij2, lij2);
429 lij_inv = gmx_mm_invsqrt_pd(lij2);
430 sk2_aj = _mm_mul_pd(sk_aj, sk_aj);
431 sk2_rinv = _mm_mul_pd(sk2_aj, rinv);
432 prod = _mm_mul_pd(onefourth, sk2_rinv);
434 logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
436 t1 = _mm_sub_pd(lij, uij);
437 t2 = _mm_mul_pd(diff2,
438 _mm_sub_pd(_mm_mul_pd(onefourth, r),
440 t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
441 t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3));
442 t4 = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
443 t4 = _mm_and_pd(t4, obc_mask3);
444 t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4));
446 sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );
448 t1 = _mm_add_pd(_mm_mul_pd(half, lij2),
449 _mm_mul_pd(prod, lij3));
451 _mm_mul_pd(onefourth,
452 _mm_add_pd(_mm_mul_pd(lij, rinv),
453 _mm_mul_pd(lij3, r))));
454 t2 = _mm_mul_pd(onefourth,
455 _mm_add_pd(_mm_mul_pd(uij, rinv),
456 _mm_mul_pd(uij3, r)));
458 _mm_add_pd(_mm_mul_pd(half, uij2),
459 _mm_mul_pd(prod, uij3)));
460 t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
461 _mm_mul_pd(rinv, rinv));
463 _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
465 _mm_mul_pd(sk2_rinv, rinv))));
466 t1 = _mm_mul_pd(rinv,
467 _mm_add_pd(_mm_mul_pd(dlij, t1),
468 _mm_add_pd(t2, t3)));
470 dadx1 = _mm_and_pd(t1, obc_mask1);
472 /* Evaluate influence of atom ai -> aj */
473 t1 = _mm_add_pd(r, sk_ai);
474 t2 = _mm_sub_pd(r, sk_ai);
475 t3 = _mm_sub_pd(sk_ai, r);
476 obc_mask1 = _mm_cmplt_pd(raj, t1);
477 obc_mask2 = _mm_cmplt_pd(raj, t2);
478 obc_mask3 = _mm_cmplt_pd(raj, t3);
480 uij = gmx_mm_inv_pd(t1);
481 lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
482 _mm_andnot_pd(obc_mask2, raj_inv));
483 dlij = _mm_and_pd(one, obc_mask2);
484 uij2 = _mm_mul_pd(uij, uij);
485 uij3 = _mm_mul_pd(uij2, uij);
486 lij2 = _mm_mul_pd(lij, lij);
487 lij3 = _mm_mul_pd(lij2, lij);
489 diff2 = _mm_sub_pd(uij2, lij2);
490 lij_inv = gmx_mm_invsqrt_pd(lij2);
491 sk2_rinv = _mm_mul_pd(sk2_ai, rinv);
492 prod = _mm_mul_pd(onefourth, sk2_rinv);
494 logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
496 t1 = _mm_sub_pd(lij, uij);
497 t2 = _mm_mul_pd(diff2,
498 _mm_sub_pd(_mm_mul_pd(onefourth, r),
500 t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
501 t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3));
502 t4 = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
503 t4 = _mm_and_pd(t4, obc_mask3);
504 t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4));
506 GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));
508 t1 = _mm_add_pd(_mm_mul_pd(half, lij2),
509 _mm_mul_pd(prod, lij3));
511 _mm_mul_pd(onefourth,
512 _mm_add_pd(_mm_mul_pd(lij, rinv),
513 _mm_mul_pd(lij3, r))));
514 t2 = _mm_mul_pd(onefourth,
515 _mm_add_pd(_mm_mul_pd(uij, rinv),
516 _mm_mul_pd(uij3, r)));
518 _mm_add_pd(_mm_mul_pd(half, uij2),
519 _mm_mul_pd(prod, uij3)));
520 t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
521 _mm_mul_pd(rinv, rinv));
523 _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
525 _mm_mul_pd(sk2_rinv, rinv))));
526 t1 = _mm_mul_pd(rinv,
527 _mm_add_pd(_mm_mul_pd(dlij, t1),
528 _mm_add_pd(t2, t3)));
530 dadx2 = _mm_and_pd(t1, obc_mask1);
532 _mm_store_pd(dadx, dadx1);
534 _mm_store_pd(dadx, dadx2);
536 } /* end normal inner loop */
544 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
545 GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
546 GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);
548 dx = _mm_sub_sd(ix, jx);
549 dy = _mm_sub_sd(iy, jy);
550 dz = _mm_sub_sd(iz, jz);
552 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
554 rinv = gmx_mm_invsqrt_pd(rsq);
555 r = _mm_mul_sd(rsq, rinv);
557 /* Compute raj_inv aj1-4 */
558 raj_inv = gmx_mm_inv_pd(raj);
560 /* Evaluate influence of atom aj -> ai */
561 t1 = _mm_add_sd(r, sk_aj);
562 t2 = _mm_sub_sd(r, sk_aj);
563 t3 = _mm_sub_sd(sk_aj, r);
564 obc_mask1 = _mm_cmplt_sd(rai, t1);
565 obc_mask2 = _mm_cmplt_sd(rai, t2);
566 obc_mask3 = _mm_cmplt_sd(rai, t3);
568 uij = gmx_mm_inv_pd(t1);
569 lij = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
570 _mm_andnot_pd(obc_mask2, rai_inv));
571 dlij = _mm_and_pd(one, obc_mask2);
572 uij2 = _mm_mul_sd(uij, uij);
573 uij3 = _mm_mul_sd(uij2, uij);
574 lij2 = _mm_mul_sd(lij, lij);
575 lij3 = _mm_mul_sd(lij2, lij);
577 diff2 = _mm_sub_sd(uij2, lij2);
578 lij_inv = gmx_mm_invsqrt_pd(lij2);
579 sk2_aj = _mm_mul_sd(sk_aj, sk_aj);
580 sk2_rinv = _mm_mul_sd(sk2_aj, rinv);
581 prod = _mm_mul_sd(onefourth, sk2_rinv);
583 logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
585 t1 = _mm_sub_sd(lij, uij);
586 t2 = _mm_mul_sd(diff2,
587 _mm_sub_sd(_mm_mul_pd(onefourth, r),
589 t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
590 t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3));
591 t4 = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
592 t4 = _mm_and_pd(t4, obc_mask3);
593 t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4));
595 sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );
597 t1 = _mm_add_sd(_mm_mul_sd(half, lij2),
598 _mm_mul_sd(prod, lij3));
600 _mm_mul_sd(onefourth,
601 _mm_add_sd(_mm_mul_sd(lij, rinv),
602 _mm_mul_sd(lij3, r))));
603 t2 = _mm_mul_sd(onefourth,
604 _mm_add_sd(_mm_mul_sd(uij, rinv),
605 _mm_mul_sd(uij3, r)));
607 _mm_add_sd(_mm_mul_sd(half, uij2),
608 _mm_mul_sd(prod, uij3)));
609 t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
610 _mm_mul_sd(rinv, rinv));
612 _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
614 _mm_mul_sd(sk2_rinv, rinv))));
615 t1 = _mm_mul_sd(rinv,
616 _mm_add_sd(_mm_mul_sd(dlij, t1),
617 _mm_add_pd(t2, t3)));
619 dadx1 = _mm_and_pd(t1, obc_mask1);
621 /* Evaluate influence of atom ai -> aj */
622 t1 = _mm_add_sd(r, sk_ai);
623 t2 = _mm_sub_sd(r, sk_ai);
624 t3 = _mm_sub_sd(sk_ai, r);
625 obc_mask1 = _mm_cmplt_sd(raj, t1);
626 obc_mask2 = _mm_cmplt_sd(raj, t2);
627 obc_mask3 = _mm_cmplt_sd(raj, t3);
629 uij = gmx_mm_inv_pd(t1);
630 lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
631 _mm_andnot_pd(obc_mask2, raj_inv));
632 dlij = _mm_and_pd(one, obc_mask2);
633 uij2 = _mm_mul_sd(uij, uij);
634 uij3 = _mm_mul_sd(uij2, uij);
635 lij2 = _mm_mul_sd(lij, lij);
636 lij3 = _mm_mul_sd(lij2, lij);
638 diff2 = _mm_sub_sd(uij2, lij2);
639 lij_inv = gmx_mm_invsqrt_pd(lij2);
640 sk2_rinv = _mm_mul_sd(sk2_ai, rinv);
641 prod = _mm_mul_sd(onefourth, sk2_rinv);
643 logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
645 t1 = _mm_sub_sd(lij, uij);
646 t2 = _mm_mul_sd(diff2,
647 _mm_sub_sd(_mm_mul_sd(onefourth, r),
649 t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
650 t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3));
651 t4 = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
652 t4 = _mm_and_pd(t4, obc_mask3);
653 t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4));
655 GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));
657 t1 = _mm_add_sd(_mm_mul_sd(half, lij2),
658 _mm_mul_sd(prod, lij3));
660 _mm_mul_sd(onefourth,
661 _mm_add_sd(_mm_mul_sd(lij, rinv),
662 _mm_mul_sd(lij3, r))));
663 t2 = _mm_mul_sd(onefourth,
664 _mm_add_sd(_mm_mul_sd(uij, rinv),
665 _mm_mul_sd(uij3, r)));
667 _mm_add_sd(_mm_mul_sd(half, uij2),
668 _mm_mul_sd(prod, uij3)));
669 t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
670 _mm_mul_sd(rinv, rinv));
672 _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
674 _mm_mul_sd(sk2_rinv, rinv))));
675 t1 = _mm_mul_sd(rinv,
676 _mm_add_sd(_mm_mul_sd(dlij, t1),
677 _mm_add_sd(t2, t3)));
679 dadx2 = _mm_and_pd(t1, obc_mask1);
681 _mm_store_pd(dadx, dadx1);
683 _mm_store_pd(dadx, dadx2);
686 gmx_mm_update_1pot_pd(sum_ai, work+ii);
690 /* Parallel summations */
693 gmx_sum(natoms, work, cr);
695 else if (DOMAINDECOMP(cr))
697 dd_atom_sum_real(cr->dd, work);
700 if (gb_algorithm == egbHCT)
703 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
705 if (born->use[i] != 0)
707 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
708 sum = 1.0/rr - work[i];
709 min_rad = rr + doffset;
712 born->bRad[i] = rad > min_rad ? rad : min_rad;
713 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
717 /* Extra communication required for DD */
718 if (DOMAINDECOMP(cr))
720 dd_atom_spread_real(cr->dd, born->bRad);
721 dd_atom_spread_real(cr->dd, fr->invsqrta);
727 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
729 if (born->use[i] != 0)
731 rr = top->atomtypes.gb_radius[md->typeA[i]];
739 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
740 born->bRad[i] = rr_inv - tsum*rr_inv2;
741 born->bRad[i] = 1.0 / born->bRad[i];
743 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
745 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
746 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
749 /* Extra (local) communication required for DD */
750 if (DOMAINDECOMP(cr))
752 dd_atom_spread_real(cr->dd, born->bRad);
753 dd_atom_spread_real(cr->dd, fr->invsqrta);
754 dd_atom_spread_real(cr->dd, born->drobc);
765 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
766 double *x, double *f, double *fshift, double *shiftvec,
767 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
769 int i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
774 double rbi, shX, shY, shZ;
779 __m128d fix, fiy, fiz;
783 __m128d rbai, rbaj, f_gb, f_gb_ai;
784 __m128d xmm1, xmm2, xmm3;
786 const __m128d two = _mm_set1_pd(2.0);
792 /* Loop to get the proper form for the Born radius term, sse style */
796 if (gb_algorithm == egbSTILL)
798 for (i = n0; i < n1; i++)
801 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
804 else if (gb_algorithm == egbHCT)
806 for (i = n0; i < n1; i++)
809 rb[i] = rbi * rbi * dvda[i];
812 else if (gb_algorithm == egbOBC)
814 for (i = n0; i < n1; i++)
817 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
821 jz = _mm_setzero_pd();
825 for (i = 0; i < nl->nri; i++)
829 is3 = 3*nl->shift[i];
831 shY = shiftvec[is3+1];
832 shZ = shiftvec[is3+2];
834 nj1 = nl->jindex[i+1];
836 ix = _mm_set1_pd(shX+x[ii3+0]);
837 iy = _mm_set1_pd(shY+x[ii3+1]);
838 iz = _mm_set1_pd(shZ+x[ii3+2]);
840 rbai = _mm_load1_pd(rb+ii);
841 fix = _mm_setzero_pd();
842 fiy = _mm_setzero_pd();
843 fiz = _mm_setzero_pd();
846 for (k = nj0; k < nj1-1; k += 2)
854 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
856 dx = _mm_sub_pd(ix, jx);
857 dy = _mm_sub_pd(iy, jy);
858 dz = _mm_sub_pd(iz, jz);
860 GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);
862 /* load chain rule terms for j1-4 */
863 f_gb = _mm_load_pd(dadx);
865 f_gb_ai = _mm_load_pd(dadx);
868 /* calculate scalar force */
869 f_gb = _mm_mul_pd(f_gb, rbai);
870 f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
871 f_gb = _mm_add_pd(f_gb, f_gb_ai);
873 tx = _mm_mul_pd(f_gb, dx);
874 ty = _mm_mul_pd(f_gb, dy);
875 tz = _mm_mul_pd(f_gb, dz);
877 fix = _mm_add_pd(fix, tx);
878 fiy = _mm_add_pd(fiy, ty);
879 fiz = _mm_add_pd(fiz, tz);
881 GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
884 /*deal with odd elements */
890 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
892 dx = _mm_sub_sd(ix, jx);
893 dy = _mm_sub_sd(iy, jy);
894 dz = _mm_sub_sd(iz, jz);
896 GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);
898 /* load chain rule terms */
899 f_gb = _mm_load_pd(dadx);
901 f_gb_ai = _mm_load_pd(dadx);
904 /* calculate scalar force */
905 f_gb = _mm_mul_sd(f_gb, rbai);
906 f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
907 f_gb = _mm_add_sd(f_gb, f_gb_ai);
909 tx = _mm_mul_sd(f_gb, dx);
910 ty = _mm_mul_sd(f_gb, dy);
911 tz = _mm_mul_sd(f_gb, dz);
913 fix = _mm_add_sd(fix, tx);
914 fiy = _mm_add_sd(fiy, ty);
915 fiz = _mm_add_sd(fiz, tz);
917 GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
920 /* fix/fiy/fiz now contain four partial force terms, that all should be
921 * added to the i particle forces and shift forces.
923 gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
930 /* keep compiler happy */
931 int genborn_sse2_dummy;
933 #endif /* SSE2 intrinsics available */