/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>

#include "typedefs.h"
#include "genborn.h"
#include "vec.h"
#include "gromacs/fileio/pdbio.h"
#include "domdec.h"
#include "physics.h"
#include "gmx_fatal.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"
/* Only compile this file if SSE2 intrinsics are available; note that the
 * "0 &&" below deliberately disables these kernels at compile time.
 */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)

#include <emmintrin.h>

#include <gmx_sse2_double.h>

#include "genborn_sse2_double.h"
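
/* Double-precision SSE2 kernels for the generalized Born (GB) implicit
 * solvent code: Born radii for the Still and HCT/OBC models, plus the
 * chain rule kernel that turns dV/dBornRadius into Cartesian forces.
 * Each kernel processes two j atoms per iteration in one __m128d register.
 */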
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    double        factor, gpi_ai, gpi2;
    double        shX, shY, shZ;
    double       *gb_radius, *vsolv, *work, *dadx, *shiftvec;
    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d four  = _mm_set1_pd(4.0);

    const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5  = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4    = _mm_set1_pd(STILL_P4);
    factor = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);

        /* Polarization energy for atom ai */
        gpi = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
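
        /* Inner loop: process the j atoms two at a time in one SSE2 double register */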
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = nl->jjnr[k];
            jnrB = nl->jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);
            dx = _mm_sub_pd(ix, jx);
            dy = _mm_sub_pd(iy, jy);
            dz = _mm_sub_pd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_pd(rinv, rinv);
            rinv4 = _mm_mul_pd(rinv2, rinv2);
            rinv6 = _mm_mul_pd(rinv4, rinv2);

            rvdw  = _mm_add_pd(rai, raj);
            ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp = _mm_cmple_pd(ratio, still_p5inv);
            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_pd(ratio, still_p5inv);
                theta = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf   = _mm_mul_pd(term, term);
                dccf  = _mm_mul_pd(_mm_mul_pd(two, term),
                                   _mm_mul_pd(sinq, theta));
            }
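
            /* ccf is the Still close-pair switching function (1-cos(theta))^2/4
             * and dccf its derivative term; for well-separated pairs they
             * evaluate to 1 and 0, respectively.
             */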
            prod = _mm_mul_pd(still_p4, vaj);
            icf4 = _mm_mul_pd(ccf, rinv4);
            icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4));

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
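
        /* Epilogue: if the j-list length was odd, one atom remains */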
        if (k < nj1)
        {
            jnrA = nl->jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);
            dx = _mm_sub_sd(ix, jx);
            dy = _mm_sub_sd(iy, jy);
            dz = _mm_sub_sd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_sd(rinv, rinv);
            rinv4 = _mm_mul_sd(rinv2, rinv2);
            rinv6 = _mm_mul_sd(rinv4, rinv2);

            rvdw  = _mm_add_sd(rai, raj);
            ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp = _mm_cmple_sd(ratio, still_p5inv);
            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_sd(ratio, still_p5inv);
                theta = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf   = _mm_mul_sd(term, term);
                dccf  = _mm_mul_sd(_mm_mul_sd(two, term),
                                   _mm_mul_sd(sinq, theta));
            }
            prod = _mm_mul_sd(still_p4, vaj);
            icf4 = _mm_mul_sd(ccf, rinv4);
            icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4));

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }
    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }
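
    /* The Still polarization energy gpi maps onto a Born radius as
     * bRad = factor/|gpi|, with factor = ONE_4PI_EPS0/2; invsqrta caches
     * invsqrt(bRad) for the GB interaction kernels.
     */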
    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2   = gpi_ai * gpi_ai;

            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }
    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB, j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double        min_rad, rad;
    double       *obc_param, *gb_radius, *work, *dadx, *shiftvec;
    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, logterm, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth = _mm_set1_pd(0.125);
    __m128d       onefourth = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);
    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);
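
        /* Each pair is visited once, so both the aj->ai and the symmetric
         * ai->aj descreening contributions are evaluated in the same pass.
         */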
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = nl->jjnr[k];
            jnrB = nl->jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);
            dx = _mm_sub_pd(ix, jx);
            dy = _mm_sub_pd(iy, jy);
            dz = _mm_sub_pd(iz, jz);

            rsq  = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv = gmx_mm_invsqrt_pd(rsq);
            r    = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for both j atoms */
            raj_inv = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_pd(r, sk_aj);
            t2        = _mm_sub_pd(r, sk_aj);
            t3        = _mm_sub_pd(sk_aj, r);
            obc_mask1 = _mm_cmplt_pd(rai, t1);
            obc_mask2 = _mm_cmplt_pd(rai, t2);
            obc_mask3 = _mm_cmplt_pd(rai, t3);

            uij  = gmx_mm_inv_pd(t1);
            lij  = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                              _mm_andnot_pd(obc_mask2, rai_inv));
            dlij = _mm_and_pd(one, obc_mask2);
            uij2 = _mm_mul_pd(uij, uij);
            uij3 = _mm_mul_pd(uij2, uij);
            lij2 = _mm_mul_pd(lij, lij);
            lij3 = _mm_mul_pd(lij2, lij);

            diff2    = _mm_sub_pd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_aj   = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv = _mm_mul_pd(sk2_aj, rinv);
            prod     = _mm_mul_pd(onefourth, sk2_rinv);
            logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1 = _mm_sub_pd(lij, uij);
            t2 = _mm_mul_pd(diff2,
                            _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                       prod));
            t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4 = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4 = _mm_and_pd(t4, obc_mask3);
            t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1));
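
            /* t1 is the closed-form HCT pairwise descreening integral between
             * the limits lij and uij; obc_mask1 zeroes pairs where atom j
             * cannot descreen atom i at all (rai >= r+sk_aj).
             */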
            t1 = _mm_add_pd(_mm_mul_pd(half, lij2),
                            _mm_mul_pd(prod, lij3));
            t1 = _mm_sub_pd(t1,
                            _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                  _mm_mul_pd(lij3, r))));
            t2 = _mm_mul_pd(onefourth,
                            _mm_add_pd(_mm_mul_pd(uij, rinv),
                                       _mm_mul_pd(uij3, r)));
            t2 = _mm_sub_pd(t2,
                            _mm_add_pd(_mm_mul_pd(half, uij2),
                                       _mm_mul_pd(prod, uij3)));
            t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                            _mm_mul_pd(rinv, rinv));
            t3 = _mm_sub_pd(t3,
                            _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                       _mm_add_pd(one,
                                                  _mm_mul_pd(sk2_rinv, rinv))));
            t1 = _mm_mul_pd(rinv,
                            _mm_add_pd(_mm_mul_pd(dlij, t1),
                                       _mm_add_pd(t2, t3)));

            dadx1 = _mm_and_pd(t1, obc_mask1);
            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_pd(r, sk_ai);
            t2        = _mm_sub_pd(r, sk_ai);
            t3        = _mm_sub_pd(sk_ai, r);
            obc_mask1 = _mm_cmplt_pd(raj, t1);
            obc_mask2 = _mm_cmplt_pd(raj, t2);
            obc_mask3 = _mm_cmplt_pd(raj, t3);

            uij  = gmx_mm_inv_pd(t1);
            lij  = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                              _mm_andnot_pd(obc_mask2, raj_inv));
            dlij = _mm_and_pd(one, obc_mask2);
            uij2 = _mm_mul_pd(uij, uij);
            uij3 = _mm_mul_pd(uij2, uij);
            lij2 = _mm_mul_pd(lij, lij);
            lij3 = _mm_mul_pd(lij2, lij);

            diff2    = _mm_sub_pd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv = _mm_mul_pd(sk2_ai, rinv);
            prod     = _mm_mul_pd(onefourth, sk2_rinv);

            logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1 = _mm_sub_pd(lij, uij);
            t2 = _mm_mul_pd(diff2,
                            _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                       prod));
            t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4 = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4 = _mm_and_pd(t4, obc_mask3);
            t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1 = _mm_add_pd(_mm_mul_pd(half, lij2),
                            _mm_mul_pd(prod, lij3));
            t1 = _mm_sub_pd(t1,
                            _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                  _mm_mul_pd(lij3, r))));
            t2 = _mm_mul_pd(onefourth,
                            _mm_add_pd(_mm_mul_pd(uij, rinv),
                                       _mm_mul_pd(uij3, r)));
            t2 = _mm_sub_pd(t2,
                            _mm_add_pd(_mm_mul_pd(half, uij2),
                                       _mm_mul_pd(prod, uij3)));
            t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                            _mm_mul_pd(rinv, rinv));
            t3 = _mm_sub_pd(t3,
                            _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                       _mm_add_pd(one,
                                                  _mm_mul_pd(sk2_rinv, rinv))));
            t1 = _mm_mul_pd(rinv,
                            _mm_add_pd(_mm_mul_pd(dlij, t1),
                                       _mm_add_pd(t2, t3)));

            dadx2 = _mm_and_pd(t1, obc_mask1);
            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */
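
        /* Epilogue: if the j-list length was odd, one atom remains */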
        if (k < nj1)
        {
            jnrA = nl->jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);
            dx = _mm_sub_sd(ix, jx);
            dy = _mm_sub_sd(iy, jy);
            dz = _mm_sub_sd(iz, jz);

            rsq  = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv = gmx_mm_invsqrt_pd(rsq);
            r    = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for the remaining j atom */
            raj_inv = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_sd(r, sk_aj);
            t2        = _mm_sub_sd(r, sk_aj);
            t3        = _mm_sub_sd(sk_aj, r);
            obc_mask1 = _mm_cmplt_sd(rai, t1);
            obc_mask2 = _mm_cmplt_sd(rai, t2);
            obc_mask3 = _mm_cmplt_sd(rai, t3);

            uij  = gmx_mm_inv_pd(t1);
            lij  = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                             _mm_andnot_pd(obc_mask2, rai_inv));
            dlij = _mm_and_pd(one, obc_mask2);
            uij2 = _mm_mul_sd(uij, uij);
            uij3 = _mm_mul_sd(uij2, uij);
            lij2 = _mm_mul_sd(lij, lij);
            lij3 = _mm_mul_sd(lij2, lij);

            diff2    = _mm_sub_sd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_aj   = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv = _mm_mul_sd(sk2_aj, rinv);
            prod     = _mm_mul_sd(onefourth, sk2_rinv);

            logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
            t1 = _mm_sub_sd(lij, uij);
            t2 = _mm_mul_sd(diff2,
                            _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                       prod));
            t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4 = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4 = _mm_and_pd(t4, obc_mask3);
            t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1));
            t1 = _mm_add_sd(_mm_mul_sd(half, lij2),
                            _mm_mul_sd(prod, lij3));
            t1 = _mm_sub_sd(t1,
                            _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                  _mm_mul_sd(lij3, r))));
            t2 = _mm_mul_sd(onefourth,
                            _mm_add_sd(_mm_mul_sd(uij, rinv),
                                       _mm_mul_sd(uij3, r)));
            t2 = _mm_sub_sd(t2,
                            _mm_add_sd(_mm_mul_sd(half, uij2),
                                       _mm_mul_sd(prod, uij3)));
            t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                            _mm_mul_sd(rinv, rinv));
            t3 = _mm_sub_sd(t3,
                            _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                       _mm_add_sd(one,
                                                  _mm_mul_sd(sk2_rinv, rinv))));
            t1 = _mm_mul_sd(rinv,
                            _mm_add_sd(_mm_mul_sd(dlij, t1),
                                       _mm_add_sd(t2, t3)));

            dadx1 = _mm_and_pd(t1, obc_mask1);
            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_sd(r, sk_ai);
            t2        = _mm_sub_sd(r, sk_ai);
            t3        = _mm_sub_sd(sk_ai, r);
            obc_mask1 = _mm_cmplt_sd(raj, t1);
            obc_mask2 = _mm_cmplt_sd(raj, t2);
            obc_mask3 = _mm_cmplt_sd(raj, t3);

            uij  = gmx_mm_inv_pd(t1);
            lij  = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                              _mm_andnot_pd(obc_mask2, raj_inv));
            dlij = _mm_and_pd(one, obc_mask2);
            uij2 = _mm_mul_sd(uij, uij);
            uij3 = _mm_mul_sd(uij2, uij);
            lij2 = _mm_mul_sd(lij, lij);
            lij3 = _mm_mul_sd(lij2, lij);

            diff2    = _mm_sub_sd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv = _mm_mul_sd(sk2_ai, rinv);
            prod     = _mm_mul_sd(onefourth, sk2_rinv);

            logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1 = _mm_sub_sd(lij, uij);
            t2 = _mm_mul_sd(diff2,
                            _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                       prod));
            t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4 = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4 = _mm_and_pd(t4, obc_mask3);
            t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));
            t1 = _mm_add_sd(_mm_mul_sd(half, lij2),
                            _mm_mul_sd(prod, lij3));
            t1 = _mm_sub_sd(t1,
                            _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                  _mm_mul_sd(lij3, r))));
            t2 = _mm_mul_sd(onefourth,
                            _mm_add_sd(_mm_mul_sd(uij, rinv),
                                       _mm_mul_sd(uij3, r)));
            t2 = _mm_sub_sd(t2,
                            _mm_add_sd(_mm_mul_sd(half, uij2),
                                       _mm_mul_sd(prod, uij3)));
            t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                            _mm_mul_sd(rinv, rinv));
            t3 = _mm_sub_sd(t3,
                            _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                       _mm_add_sd(one,
                                                  _mm_mul_sd(sk2_rinv, rinv))));
            t1 = _mm_mul_sd(rinv,
                            _mm_add_sd(_mm_mul_sd(dlij, t1),
                                       _mm_add_sd(t2, t3)));

            dadx2 = _mm_and_pd(t1, obc_mask1);
            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }

        gmx_mm_update_1pot_pd(sum_ai, work+ii);
    }
    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }
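
    /* Convert the accumulated descreening integrals in work[] into Born radii:
     * HCT inverts the sum directly, while OBC rescales it through a tanh of a
     * polynomial in the scaled integral before inverting.
     */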
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}
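
/* Given dV/dBornRadius in dvda and the stored dBornRadius/dr terms in dadx,
 * this kernel applies the chain rule to obtain the GB contribution to the
 * Cartesian forces and shift forces.
 */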
void
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB, j3A, j3B;
    double        rbi, shX, shY, shZ;
    double       *rb;
    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, tx, ty, tz;
    __m128d       fix, fiy, fiz;
    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb = born->work;
    n0 = 0;
    n1 = natoms;
    /* Loop to get the proper form for the Born radius term, sse style */
    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);

        rbai = _mm_load1_pd(rb+ii);
        fix  = _mm_setzero_pd();
        fiy  = _mm_setzero_pd();
        fiz  = _mm_setzero_pd();
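
        /* fix/fiy/fiz accumulate the force on atom ii over the whole j-loop;
         * the j forces are subtracted directly from f[] as we go.
         */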
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = nl->jjnr[k];
            jnrB = nl->jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            dx = _mm_sub_pd(ix, jx);
            dy = _mm_sub_pd(iy, jy);
            dz = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for j1-2 */
            f_gb = _mm_load_pd(dadx);
            dadx += 2;
            f_gb_ai = _mm_load_pd(dadx);
            dadx += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx = _mm_mul_pd(f_gb, dx);
            ty = _mm_mul_pd(f_gb, dy);
            tz = _mm_mul_pd(f_gb, dz);

            fix = _mm_add_pd(fix, tx);
            fiy = _mm_add_pd(fiy, ty);
            fiz = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }
        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA = nl->jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx = _mm_sub_sd(ix, jx);
            dy = _mm_sub_sd(iy, jy);
            dz = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb = _mm_load_pd(dadx);
            dadx += 2;
            f_gb_ai = _mm_load_pd(dadx);
            dadx += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx = _mm_mul_sd(f_gb, dx);
            ty = _mm_mul_sd(f_gb, dy);
            tz = _mm_mul_sd(f_gb, dz);

            fix = _mm_add_sd(fix, tx);
            fiy = _mm_add_sd(fiy, ty);
            fiz = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }
        /* fix/fiy/fiz now contain two partial force terms, that all should be
         * added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }
}
#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */