/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/fileio/pdbio.h"

#include "gmx_fatal.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"
/* Only compile this file if SSE2 intrinsics are available; note that the
 * leading 0 && below currently keeps these kernels disabled unconditionally. */
#if 0 && defined (GMX_X86_SSE2)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"
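/* Double-precision SSE2 kernels for the Generalized Born radii (Still and
 * HCT/OBC models) and for the chain-rule force contribution from the radii.
 * Two j atoms are processed per __m128d register; a scalar (_sd) epilogue
 * handles an odd final neighbor in each list. */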
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *jjnr;
    double        shX, shY, shZ;
    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;
    double       *shiftvec;
    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;
    const __m128d half        = _mm_set1_pd(0.5);
    const __m128d three       = _mm_set1_pd(3.0);
    const __m128d one         = _mm_set1_pd(1.0);
    const __m128d two         = _mm_set1_pd(2.0);
    const __m128d zero        = _mm_set1_pd(0.0);
    const __m128d four        = _mm_set1_pd(4.0);

    const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5  = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4    = _mm_set1_pd(STILL_P4);
    factor = 0.5 * ONE_4PI_EPS0;
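    /* The Still radii are recovered at the end of this routine as
     * bRad_i = factor/|gpi_i|, where gpi_i is the polarization-energy sum
     * accumulated in work[]; fr->invsqrta caches 1/sqrt(bRad_i) for the
     * GB interaction kernels. */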
    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);
        /* Polarization energy for atom ai */
        gpi     = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
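        /* Each neighbor j below contributes roughly STILL_P4*vsolv_j*ccf/r^4
         * to gpi, where ccf smoothly switches the term off for close contacts
         * (ratio below STILL_P5INV); the corresponding r^-6 derivative
         * factors are stored in dadx for the chain-rule force pass. */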
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);
            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_pd(rinv, rinv);
            rinv4 = _mm_mul_pd(rinv2, rinv2);
            rinv6 = _mm_mul_pd(rinv4, rinv2);

            rvdw  = _mm_add_pd(rai, raj);
            ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd(_mm_mul_pd(rvdw, rvdw)));
            mask_cmp = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_pd(ratio, still_p5inv);
                theta = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf   = _mm_mul_pd(term, term);
                dccf  = _mm_mul_pd(_mm_mul_pd(two, term),
                                   _mm_mul_pd(sinq, theta));
            }
            prod = _mm_mul_pd(still_p4, vaj);
            icf4 = _mm_mul_pd(ccf, rinv4);
            icf6 = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4));

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        if (k < nj1)
        {
            jnrA = jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);
            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_sd(rinv, rinv);
            rinv4 = _mm_mul_sd(rinv2, rinv2);
            rinv6 = _mm_mul_sd(rinv4, rinv2);

            rvdw  = _mm_add_sd(rai, raj);
            ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd(_mm_mul_pd(rvdw, rvdw)));
            mask_cmp = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_sd(ratio, still_p5inv);
                theta = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf   = _mm_mul_sd(term, term);
                dccf  = _mm_mul_sd(_mm_mul_sd(two, term),
                                   _mm_mul_sd(sinq, theta));
            }
            prod = _mm_mul_sd(still_p4, vaj);
            icf4 = _mm_mul_sd(ccf, rinv4);
            icf6 = _mm_mul_sd(_mm_sub_sd(_mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4));

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }
    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }
    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai          = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2            = gpi_ai * gpi_ai;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }
    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
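/* HCT/OBC Born radii. For each atom pair the loops below evaluate the
 * analytic descreening integral over the scaled j sphere between the limits
 * uij = 1/(r+sk) and lij = 1/(r-sk) (or 1/rai when the i sphere is not
 * fully outside), roughly
 *   I = 1/2 [ (lij-uij) + (uij^2-lij^2)*(r/4 - sk^2/(4r)) + ln(uij/lij)/(2r) ],
 * plus an extra 2*(1/rai - lij) term when sk > r + rai. The integral sums
 * go to work[], and the d(I)/dr factors go to dadx for the force pass. */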
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *jjnr;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double        min_rad, rad;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    double       *dadx;
    double       *shiftvec;
    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, logterm, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       obc_mask1, obc_mask2, obc_mask3;
    __m128d       oneeighth = _mm_set1_pd(0.125);
    __m128d       onefourth = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);
    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);
        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai  = _mm_setzero_pd();

        sk_ai   = _mm_load1_pd(born->param+ii);
        sk2_ai  = _mm_mul_pd(sk_ai, sk_ai);
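        /* Accumulate the aj->ai descreening integral in sum_ai; obc_mask1
         * (rai < r+sk) selects pairs that descreen at all, obc_mask2
         * (rai < r-sk) picks the 1/(r-sk) lower limit over 1/rai, and
         * obc_mask3 (rai < sk-r) enables the engulfed-atom correction. */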
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);
            dx   = _mm_sub_pd(ix, jx);
            dy   = _mm_sub_pd(iy, jy);
            dz   = _mm_sub_pd(iz, jz);

            rsq  = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv = gmx_mm_invsqrt_pd(rsq);
            r    = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv */
            raj_inv = gmx_mm_inv_pd(raj);
            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_pd(r, sk_aj);
            t2        = _mm_sub_pd(r, sk_aj);
            t3        = _mm_sub_pd(sk_aj, r);
            obc_mask1 = _mm_cmplt_pd(rai, t1);
            obc_mask2 = _mm_cmplt_pd(rai, t2);
            obc_mask3 = _mm_cmplt_pd(rai, t3);

            uij       = gmx_mm_inv_pd(t1);
            lij       = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                  _mm_andnot_pd(obc_mask2, rai_inv));
            dlij      = _mm_and_pd(one, obc_mask2);
            uij2      = _mm_mul_pd(uij, uij);
            uij3      = _mm_mul_pd(uij2, uij);
            lij2      = _mm_mul_pd(lij, lij);
            lij3      = _mm_mul_pd(lij2, lij);

            diff2     = _mm_sub_pd(uij2, lij2);
            lij_inv   = gmx_mm_invsqrt_pd(lij2);
            sk2_aj    = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv  = _mm_mul_pd(sk2_aj, rinv);
            prod      = _mm_mul_pd(onefourth, sk2_rinv);

            logterm   = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
            t1        = _mm_sub_pd(lij, uij);
            t2        = _mm_mul_pd(diff2,
                                   _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                              prod));
            t3        = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1        = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4        = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4        = _mm_and_pd(t4, obc_mask3);
            t1        = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai    = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1));
            t1        = _mm_add_pd(_mm_mul_pd(half, lij2),
                                   _mm_mul_pd(prod, lij3));
            t1        = _mm_sub_pd(t1,
                                   _mm_mul_pd(onefourth,
                                              _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                         _mm_mul_pd(lij3, r))));
            t2        = _mm_mul_pd(onefourth,
                                   _mm_add_pd(_mm_mul_pd(uij, rinv),
                                              _mm_mul_pd(uij3, r)));
            t2        = _mm_sub_pd(t2,
                                   _mm_add_pd(_mm_mul_pd(half, uij2),
                                              _mm_mul_pd(prod, uij3)));
            t3        = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                   _mm_mul_pd(rinv, rinv));
            t3        = _mm_sub_pd(t3,
                                   _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                              _mm_add_pd(one,
                                                         _mm_mul_pd(sk2_rinv, rinv))));
            t1        = _mm_mul_pd(rinv,
                                   _mm_add_pd(_mm_mul_pd(dlij, t1),
                                              _mm_add_pd(t2, t3)));

            dadx1     = _mm_and_pd(t1, obc_mask1);
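            /* dadx1 is effectively (1/r)*dI/dr for the aj->ai term; the
             * chain-rule kernel later multiplies it by the model-specific
             * dV/dR factor in rb[] and by the coordinate differences. */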
            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_pd(r, sk_ai);
            t2        = _mm_sub_pd(r, sk_ai);
            t3        = _mm_sub_pd(sk_ai, r);
            obc_mask1 = _mm_cmplt_pd(raj, t1);
            obc_mask2 = _mm_cmplt_pd(raj, t2);
            obc_mask3 = _mm_cmplt_pd(raj, t3);

            uij       = gmx_mm_inv_pd(t1);
            lij       = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                  _mm_andnot_pd(obc_mask2, raj_inv));
            dlij      = _mm_and_pd(one, obc_mask2);
            uij2      = _mm_mul_pd(uij, uij);
            uij3      = _mm_mul_pd(uij2, uij);
            lij2      = _mm_mul_pd(lij, lij);
            lij3      = _mm_mul_pd(lij2, lij);

            diff2     = _mm_sub_pd(uij2, lij2);
            lij_inv   = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv  = _mm_mul_pd(sk2_ai, rinv);
            prod      = _mm_mul_pd(onefourth, sk2_rinv);

            logterm   = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1        = _mm_sub_pd(lij, uij);
            t2        = _mm_mul_pd(diff2,
                                   _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                              prod));
            t3        = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1        = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4        = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4        = _mm_and_pd(t4, obc_mask3);
            t1        = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));
            t1        = _mm_add_pd(_mm_mul_pd(half, lij2),
                                   _mm_mul_pd(prod, lij3));
            t1        = _mm_sub_pd(t1,
                                   _mm_mul_pd(onefourth,
                                              _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                         _mm_mul_pd(lij3, r))));
            t2        = _mm_mul_pd(onefourth,
                                   _mm_add_pd(_mm_mul_pd(uij, rinv),
                                              _mm_mul_pd(uij3, r)));
            t2        = _mm_sub_pd(t2,
                                   _mm_add_pd(_mm_mul_pd(half, uij2),
                                              _mm_mul_pd(prod, uij3)));
            t3        = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                   _mm_mul_pd(rinv, rinv));
            t3        = _mm_sub_pd(t3,
                                   _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                              _mm_add_pd(one,
                                                         _mm_mul_pd(sk2_rinv, rinv))));
            t1        = _mm_mul_pd(rinv,
                                   _mm_add_pd(_mm_mul_pd(dlij, t1),
                                              _mm_add_pd(t2, t3)));

            dadx2     = _mm_and_pd(t1, obc_mask1);
            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;

        } /* end normal inner loop */
        if (k < nj1)
        {
            jnrA = jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);
            dx   = _mm_sub_sd(ix, jx);
            dy   = _mm_sub_sd(iy, jy);
            dz   = _mm_sub_sd(iz, jz);

            rsq  = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv = gmx_mm_invsqrt_pd(rsq);
            r    = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv */
            raj_inv = gmx_mm_inv_pd(raj);
            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_sd(r, sk_aj);
            t2        = _mm_sub_sd(r, sk_aj);
            t3        = _mm_sub_sd(sk_aj, r);
            obc_mask1 = _mm_cmplt_sd(rai, t1);
            obc_mask2 = _mm_cmplt_sd(rai, t2);
            obc_mask3 = _mm_cmplt_sd(rai, t3);

            uij       = gmx_mm_inv_pd(t1);
            lij       = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                  _mm_andnot_pd(obc_mask2, rai_inv));
            dlij      = _mm_and_pd(one, obc_mask2);
            uij2      = _mm_mul_sd(uij, uij);
            uij3      = _mm_mul_sd(uij2, uij);
            lij2      = _mm_mul_sd(lij, lij);
            lij3      = _mm_mul_sd(lij2, lij);

            diff2     = _mm_sub_sd(uij2, lij2);
            lij_inv   = gmx_mm_invsqrt_pd(lij2);
            sk2_aj    = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv  = _mm_mul_sd(sk2_aj, rinv);
            prod      = _mm_mul_sd(onefourth, sk2_rinv);

            logterm   = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
            t1        = _mm_sub_sd(lij, uij);
            t2        = _mm_mul_sd(diff2,
                                   _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                              prod));
            t3        = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1        = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4        = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4        = _mm_and_pd(t4, obc_mask3);
            t1        = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai    = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1));
            t1        = _mm_add_sd(_mm_mul_sd(half, lij2),
                                   _mm_mul_sd(prod, lij3));
            t1        = _mm_sub_sd(t1,
                                   _mm_mul_sd(onefourth,
                                              _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                         _mm_mul_sd(lij3, r))));
            t2        = _mm_mul_sd(onefourth,
                                   _mm_add_sd(_mm_mul_sd(uij, rinv),
                                              _mm_mul_sd(uij3, r)));
            t2        = _mm_sub_sd(t2,
                                   _mm_add_sd(_mm_mul_sd(half, uij2),
                                              _mm_mul_sd(prod, uij3)));
            t3        = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                   _mm_mul_sd(rinv, rinv));
            t3        = _mm_sub_sd(t3,
                                   _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                              _mm_add_sd(one,
                                                         _mm_mul_sd(sk2_rinv, rinv))));
            t1        = _mm_mul_sd(rinv,
                                   _mm_add_sd(_mm_mul_sd(dlij, t1),
                                              _mm_add_sd(t2, t3)));

            dadx1     = _mm_and_pd(t1, obc_mask1);
            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_sd(r, sk_ai);
            t2        = _mm_sub_sd(r, sk_ai);
            t3        = _mm_sub_sd(sk_ai, r);
            obc_mask1 = _mm_cmplt_sd(raj, t1);
            obc_mask2 = _mm_cmplt_sd(raj, t2);
            obc_mask3 = _mm_cmplt_sd(raj, t3);

            uij       = gmx_mm_inv_pd(t1);
            lij       = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                  _mm_andnot_pd(obc_mask2, raj_inv));
            dlij      = _mm_and_pd(one, obc_mask2);
            uij2      = _mm_mul_sd(uij, uij);
            uij3      = _mm_mul_sd(uij2, uij);
            lij2      = _mm_mul_sd(lij, lij);
            lij3      = _mm_mul_sd(lij2, lij);

            diff2     = _mm_sub_sd(uij2, lij2);
            lij_inv   = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv  = _mm_mul_sd(sk2_ai, rinv);
            prod      = _mm_mul_sd(onefourth, sk2_rinv);

            logterm   = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
            t1        = _mm_sub_sd(lij, uij);
            t2        = _mm_mul_sd(diff2,
                                   _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                              prod));
            t3        = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1        = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4        = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4        = _mm_and_pd(t4, obc_mask3);
            t1        = _mm_mul_sd(half, _mm_add_sd(t1, t4));
            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));
            t1        = _mm_add_sd(_mm_mul_sd(half, lij2),
                                   _mm_mul_sd(prod, lij3));
            t1        = _mm_sub_sd(t1,
                                   _mm_mul_sd(onefourth,
                                              _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                         _mm_mul_sd(lij3, r))));
            t2        = _mm_mul_sd(onefourth,
                                   _mm_add_sd(_mm_mul_sd(uij, rinv),
                                              _mm_mul_sd(uij3, r)));
            t2        = _mm_sub_sd(t2,
                                   _mm_add_sd(_mm_mul_sd(half, uij2),
                                              _mm_mul_sd(prod, uij3)));
            t3        = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                   _mm_mul_sd(rinv, rinv));
            t3        = _mm_sub_sd(t3,
                                   _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                              _mm_add_sd(one,
                                                         _mm_mul_sd(sk2_rinv, rinv))));
            t1        = _mm_mul_sd(rinv,
                                   _mm_add_sd(_mm_mul_sd(dlij, t1),
                                              _mm_add_sd(t2, t3)));

            dadx2     = _mm_and_pd(t1, obc_mask1);
            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);
    }
    /* Parallel summations */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }
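    /* Convert the accumulated integrals to Born radii. HCT inverts
     * 1/rho_tilde - sum directly (clamped below at min_rad), where rho_tilde
     * is the vdW radius minus the dielectric offset. OBC instead applies the
     * tanh(alpha*psi - beta*psi^2 + gamma*psi^3) rescaling with
     * psi = rho_tilde*sum, and stores the corresponding tanh-derivative
     * factor in drobc for the chain rule. */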
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        /* Calculate the radii */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}
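/* Chain-rule contribution to the forces from the coordinate dependence of
 * the Born radii: the dadx factors stored by the radius kernels above are
 * combined with dvda (dV/dR_i), after dvda is first mapped onto the variable
 * that dadx actually differentiates (see the rb[] setup below). */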
void
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB, j3A, j3B;
    int          *jjnr;
    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, tx, ty, tz;
    __m128d       fix, fiy, fiz;
    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);
    rb   = born->work;
    jjnr = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, sse style */
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }
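    /* rb[i] now holds dvda[i] mapped onto the variable that the stored dadx
     * derivatives act on (up to sign conventions handled in the force
     * update): dR/d(gpi) ~ 2*R^2/ONE_4PI_EPS0 for Still, R^2 for HCT, and
     * R^2*drobc for OBC. */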
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);
        rbai = _mm_load1_pd(rb+ii);
        fix  = _mm_setzero_pd();
        fiy  = _mm_setzero_pd();
        fiz  = _mm_setzero_pd();
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            dx = _mm_sub_pd(ix, jx);
            dy = _mm_sub_pd(iy, jy);
            dz = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);
            /* load chain rule terms for j1-2 */
            f_gb    = _mm_load_pd(dadx);
            dadx   += 2;
            f_gb_ai = _mm_load_pd(dadx);
            dadx   += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);
            tx  = _mm_mul_pd(f_gb, dx);
            ty  = _mm_mul_pd(f_gb, dy);
            tz  = _mm_mul_pd(f_gb, dz);

            fix = _mm_add_pd(fix, tx);
            fiy = _mm_add_pd(fiy, ty);
            fiz = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }
        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA = jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx = _mm_sub_sd(ix, jx);
            dy = _mm_sub_sd(iy, jy);
            dz = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);
            /* load chain rule terms */
            f_gb    = _mm_load_pd(dadx);
            dadx   += 2;
            f_gb_ai = _mm_load_pd(dadx);
            dadx   += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);
            tx  = _mm_mul_sd(f_gb, dx);
            ty  = _mm_mul_sd(f_gb, dy);
            tz  = _mm_mul_sd(f_gb, dz);

            fix = _mm_add_sd(fix, tx);
            fiy = _mm_add_sd(fiy, ty);
            fiz = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }
        /* fix/fiy/fiz now contain two partial force terms that should all be
         * added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }
}
#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */