/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/math/vec.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/math/units.h"
#include "gromacs/legacyheaders/domdec.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
/* Only compile this file if SSE2 intrinsics are available; note that the
 * leading "0 &&" deliberately disables these kernels for now.
 */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    double        shX, shY, shZ;
    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;
    double       *shiftvec;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;
    const __m128d half        = _mm_set1_pd(0.5);
    const __m128d three       = _mm_set1_pd(3.0);
    const __m128d one         = _mm_set1_pd(1.0);
    const __m128d two         = _mm_set1_pd(2.0);
    const __m128d zero        = _mm_set1_pd(0.0);
    const __m128d four        = _mm_set1_pd(4.0);

    const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5  = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4    = _mm_set1_pd(STILL_P4);
    /* Prefactor that converts the accumulated polarization sum into a Born radius */
    factor    = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    /* Clear the per-atom accumulation buffer */
    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);
        /* Polarization energy for atom ai */
        gpi     = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
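        /* The loop below accumulates the Still pairwise polarization term,
         * gpi += P4*v_j*ccf/r^4, two neighbor-list entries per iteration;
         * ccf is a close-contact correction switched on when the pair
         * distance is small relative to the sum of the vdW radii.
         */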
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = nl->jjnr[k];
            jnrB = nl->jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;
            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);
            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_pd(rinv, rinv);
            rinv4 = _mm_mul_pd(rinv2, rinv2);
            rinv6 = _mm_mul_pd(rinv4, rinv2);

            rvdw  = _mm_add_pd(rai, raj);
            ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp = _mm_cmple_pd(ratio, still_p5inv);
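            /* ccf is the Still close-contact switching function: with
             * theta = STILL_PIP5*ratio, ccf = (0.5*(1-cos(theta)))^2 rises
             * smoothly from 0 at full overlap to 1 at ratio = STILL_P5INV,
             * beyond which it stays at 1; dccf is the matching derivative
             * factor that enters icf6 for the force calculation.
             */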
            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_pd(ratio, still_p5inv);
                theta = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf   = _mm_mul_pd(term, term);
                dccf  = _mm_mul_pd(_mm_mul_pd(two, term),
                                   _mm_mul_pd(sinq, theta));
            }
            prod = _mm_mul_pd(still_p4, vaj);
            icf4 = _mm_mul_pd(ccf, rinv4);
            icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );
            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
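        /* The dadx stream written above stores the pair derivatives of the
         * Born radii in neighbor-list order; calc_gb_chainrule_sse2_double()
         * later reads it back in exactly the same order to build forces.
         */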
        /* deal with an odd remaining element, if any */
        if (k < nj1)
        {
            jnrA = nl->jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);
            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_sd(rinv, rinv);
            rinv4 = _mm_mul_sd(rinv2, rinv2);
            rinv6 = _mm_mul_sd(rinv4, rinv2);

            rvdw  = _mm_add_sd(rai, raj);
            ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp = _mm_cmple_sd(ratio, still_p5inv);
            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_sd(ratio, still_p5inv);
                theta = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf   = _mm_mul_sd(term, term);
                dccf  = _mm_mul_sd(_mm_mul_sd(two, term),
                                   _mm_mul_sd(sinq, theta));
            }
            prod = _mm_mul_sd(still_p4, vaj);
            icf4 = _mm_mul_sd(ccf, rinv4);
            icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );
            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }
    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }
    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai          = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2            = gpi_ai * gpi_ai;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }
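    /* Still Born radius: bRad = factor/|gpi_ai|, with factor = 0.5*ONE_4PI_EPS0;
     * fr->invsqrta caches 1/sqrt(bRad) for the GB interaction kernels.
     */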
    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
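/* Compute HCT or OBC Born radii, processing two neighbor-list entries per
 * SSE2 pass and writing the pair derivatives to fr->dadx for the subsequent
 * chain-rule force evaluation.
 */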
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB, j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double        min_rad, rad;
    double       *gb_radius;
    double       *obc_param;
    double       *work;
    double       *dadx;
    double       *shiftvec;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, logterm, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       obc_mask1, obc_mask2, obc_mask3;
    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);
    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);
        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai  = _mm_setzero_pd();

        sk_ai   = _mm_load1_pd(born->param+ii);
        sk2_ai  = _mm_mul_pd(sk_ai, sk_ai);
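        /* The inner loop accumulates the HCT/OBC descreening sum for atom ai:
         * each neighbor aj reduces ai's effective Born radius through an
         * analytic pair integral between integration limits L and U; the code
         * works directly with the reciprocals lij = 1/L and uij = 1/U.
         */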
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = nl->jjnr[k];
            jnrB = nl->jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;
            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx   = _mm_sub_pd(ix, jx);
            dy   = _mm_sub_pd(iy, jy);
            dz   = _mm_sub_pd(iz, jz);

            rsq  = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv = gmx_mm_invsqrt_pd(rsq);
            r    = _mm_mul_pd(rsq, rinv);
            /* Compute raj_inv for aj1-2 */
            raj_inv = gmx_mm_inv_pd(raj);
            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_pd(r, sk_aj);
            t2        = _mm_sub_pd(r, sk_aj);
            t3        = _mm_sub_pd(sk_aj, r);
            obc_mask1 = _mm_cmplt_pd(rai, t1);
            obc_mask2 = _mm_cmplt_pd(rai, t2);
            obc_mask3 = _mm_cmplt_pd(rai, t3);
            uij      = gmx_mm_inv_pd(t1);
            lij      = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                  _mm_andnot_pd(obc_mask2, rai_inv));
            dlij     = _mm_and_pd(one, obc_mask2);
            uij2     = _mm_mul_pd(uij, uij);
            uij3     = _mm_mul_pd(uij2, uij);
            lij2     = _mm_mul_pd(lij, lij);
            lij3     = _mm_mul_pd(lij2, lij);

            diff2    = _mm_sub_pd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_aj   = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv = _mm_mul_pd(sk2_aj, rinv);
            prod     = _mm_mul_pd(onefourth, sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
            t1      = _mm_sub_pd(lij, uij);
            t2      = _mm_mul_pd(diff2,
                                 _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                            prod));
            t3      = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1      = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4      = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4      = _mm_and_pd(t4, obc_mask3);
            t1      = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai  = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );
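            /* t1 above is the standard HCT pair integral,
             *   0.5*[ (1/L - 1/U) + (1/U^2 - 1/L^2)*(r/4 - sk^2/(4r))
             *         + 1/(2r)*ln(L/U) ],
             * plus the 2*(1/rai - 1/L) correction (t4) applied when atom ai
             * lies inside the descreening sphere of aj.
             */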
            t1      = _mm_add_pd(_mm_mul_pd(half, lij2),
                                 _mm_mul_pd(prod, lij3));
            t1      = _mm_sub_pd(t1,
                                 _mm_mul_pd(onefourth,
                                            _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                       _mm_mul_pd(lij3, r))));
            t2      = _mm_mul_pd(onefourth,
                                 _mm_add_pd(_mm_mul_pd(uij, rinv),
                                            _mm_mul_pd(uij3, r)));
            t2      = _mm_sub_pd(t2,
                                 _mm_add_pd(_mm_mul_pd(half, uij2),
                                            _mm_mul_pd(prod, uij3)));
            t3      = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                 _mm_mul_pd(rinv, rinv));
            t3      = _mm_sub_pd(t3,
                                 _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                            _mm_add_pd(one,
                                                       _mm_mul_pd(sk2_rinv, rinv))));
            t1      = _mm_mul_pd(rinv,
                                 _mm_add_pd(_mm_mul_pd(dlij, t1),
                                            _mm_add_pd(t2, t3)));

            dadx1   = _mm_and_pd(t1, obc_mask1);
            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_pd(r, sk_ai);
            t2        = _mm_sub_pd(r, sk_ai);
            t3        = _mm_sub_pd(sk_ai, r);
            obc_mask1 = _mm_cmplt_pd(raj, t1);
            obc_mask2 = _mm_cmplt_pd(raj, t2);
            obc_mask3 = _mm_cmplt_pd(raj, t3);

            uij      = gmx_mm_inv_pd(t1);
            lij      = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                  _mm_andnot_pd(obc_mask2, raj_inv));
            dlij     = _mm_and_pd(one, obc_mask2);
            uij2     = _mm_mul_pd(uij, uij);
            uij3     = _mm_mul_pd(uij2, uij);
            lij2     = _mm_mul_pd(lij, lij);
            lij3     = _mm_mul_pd(lij2, lij);

            diff2    = _mm_sub_pd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv = _mm_mul_pd(sk2_ai, rinv);
            prod     = _mm_mul_pd(onefourth, sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
            t1      = _mm_sub_pd(lij, uij);
            t2      = _mm_mul_pd(diff2,
                                 _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                            prod));
            t3      = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1      = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4      = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4      = _mm_and_pd(t4, obc_mask3);
            t1      = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));
            t1      = _mm_add_pd(_mm_mul_pd(half, lij2),
                                 _mm_mul_pd(prod, lij3));
            t1      = _mm_sub_pd(t1,
                                 _mm_mul_pd(onefourth,
                                            _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                       _mm_mul_pd(lij3, r))));
            t2      = _mm_mul_pd(onefourth,
                                 _mm_add_pd(_mm_mul_pd(uij, rinv),
                                            _mm_mul_pd(uij3, r)));
            t2      = _mm_sub_pd(t2,
                                 _mm_add_pd(_mm_mul_pd(half, uij2),
                                            _mm_mul_pd(prod, uij3)));
            t3      = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                 _mm_mul_pd(rinv, rinv));
            t3      = _mm_sub_pd(t3,
                                 _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                            _mm_add_pd(one,
                                                       _mm_mul_pd(sk2_rinv, rinv))));
            t1      = _mm_mul_pd(rinv,
                                 _mm_add_pd(_mm_mul_pd(dlij, t1),
                                            _mm_add_pd(t2, t3)));

            dadx2   = _mm_and_pd(t1, obc_mask1);
            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */
        /* deal with an odd remaining element, if any */
        if (k < nj1)
        {
            jnrA = nl->jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);
            dx   = _mm_sub_sd(ix, jx);
            dy   = _mm_sub_sd(iy, jy);
            dz   = _mm_sub_sd(iz, jz);

            rsq  = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv = gmx_mm_invsqrt_pd(rsq);
            r    = _mm_mul_sd(rsq, rinv);
            /* Compute raj_inv for the remaining aj */
            raj_inv = gmx_mm_inv_pd(raj);
            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_sd(r, sk_aj);
            t2        = _mm_sub_sd(r, sk_aj);
            t3        = _mm_sub_sd(sk_aj, r);
            obc_mask1 = _mm_cmplt_sd(rai, t1);
            obc_mask2 = _mm_cmplt_sd(rai, t2);
            obc_mask3 = _mm_cmplt_sd(rai, t3);

            uij      = gmx_mm_inv_pd(t1);
            lij      = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                 _mm_andnot_pd(obc_mask2, rai_inv));
            dlij     = _mm_and_pd(one, obc_mask2);
            uij2     = _mm_mul_sd(uij, uij);
            uij3     = _mm_mul_sd(uij2, uij);
            lij2     = _mm_mul_sd(lij, lij);
            lij3     = _mm_mul_sd(lij2, lij);

            diff2    = _mm_sub_sd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_aj   = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv = _mm_mul_sd(sk2_aj, rinv);
            prod     = _mm_mul_sd(onefourth, sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
            t1      = _mm_sub_sd(lij, uij);
            t2      = _mm_mul_sd(diff2,
                                 _mm_sub_sd(_mm_mul_pd(onefourth, r),
                                            prod));
            t3      = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1      = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4      = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4      = _mm_and_pd(t4, obc_mask3);
            t1      = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai  = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );
            t1      = _mm_add_sd(_mm_mul_sd(half, lij2),
                                 _mm_mul_sd(prod, lij3));
            t1      = _mm_sub_sd(t1,
                                 _mm_mul_sd(onefourth,
                                            _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                       _mm_mul_sd(lij3, r))));
            t2      = _mm_mul_sd(onefourth,
                                 _mm_add_sd(_mm_mul_sd(uij, rinv),
                                            _mm_mul_sd(uij3, r)));
            t2      = _mm_sub_sd(t2,
                                 _mm_add_sd(_mm_mul_sd(half, uij2),
                                            _mm_mul_sd(prod, uij3)));
            t3      = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                 _mm_mul_sd(rinv, rinv));
            t3      = _mm_sub_sd(t3,
                                 _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                            _mm_add_sd(one,
                                                       _mm_mul_sd(sk2_rinv, rinv))));
            t1      = _mm_mul_sd(rinv,
                                 _mm_add_sd(_mm_mul_sd(dlij, t1),
                                            _mm_add_pd(t2, t3)));

            dadx1   = _mm_and_pd(t1, obc_mask1);
            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_sd(r, sk_ai);
            t2        = _mm_sub_sd(r, sk_ai);
            t3        = _mm_sub_sd(sk_ai, r);
            obc_mask1 = _mm_cmplt_sd(raj, t1);
            obc_mask2 = _mm_cmplt_sd(raj, t2);
            obc_mask3 = _mm_cmplt_sd(raj, t3);

            uij      = gmx_mm_inv_pd(t1);
            lij      = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                  _mm_andnot_pd(obc_mask2, raj_inv));
            dlij     = _mm_and_pd(one, obc_mask2);
            uij2     = _mm_mul_sd(uij, uij);
            uij3     = _mm_mul_sd(uij2, uij);
            lij2     = _mm_mul_sd(lij, lij);
            lij3     = _mm_mul_sd(lij2, lij);

            diff2    = _mm_sub_sd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv = _mm_mul_sd(sk2_ai, rinv);
            prod     = _mm_mul_sd(onefourth, sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
            t1      = _mm_sub_sd(lij, uij);
            t2      = _mm_mul_sd(diff2,
                                 _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                            prod));
            t3      = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1      = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4      = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4      = _mm_and_pd(t4, obc_mask3);
            t1      = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));
            t1      = _mm_add_sd(_mm_mul_sd(half, lij2),
                                 _mm_mul_sd(prod, lij3));
            t1      = _mm_sub_sd(t1,
                                 _mm_mul_sd(onefourth,
                                            _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                       _mm_mul_sd(lij3, r))));
            t2      = _mm_mul_sd(onefourth,
                                 _mm_add_sd(_mm_mul_sd(uij, rinv),
                                            _mm_mul_sd(uij3, r)));
            t2      = _mm_sub_sd(t2,
                                 _mm_add_sd(_mm_mul_sd(half, uij2),
                                            _mm_mul_sd(prod, uij3)));
            t3      = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                 _mm_mul_sd(rinv, rinv));
            t3      = _mm_sub_sd(t3,
                                 _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                            _mm_add_sd(one,
                                                       _mm_mul_sd(sk2_rinv, rinv))));
            t1      = _mm_mul_sd(rinv,
                                 _mm_add_sd(_mm_mul_sd(dlij, t1),
                                            _mm_add_sd(t2, t3)));

            dadx2   = _mm_and_pd(t1, obc_mask1);
            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);
    }
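    /* work[] now holds the summed descreening contributions for every local
     * atom; the per-algorithm loops below convert these sums to Born radii.
     */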
    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
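        /* This is the OBC(II) radius formula: 1/bRad = 1/(rho - doffset)
         *   - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho,
         * with psi = (rho - doffset)*work[i]; drobc stores the derivative
         * factor that the chain-rule force loop needs for OBC.
         */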
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}
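/* Combine the stored dadx pair derivatives with the per-atom dvda terms to
 * produce Cartesian forces and shift forces.
 */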
void
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB, j3A, j3B;
    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, tx, ty, tz;
    __m128d       fix, fiy, fiz;
    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);
    rb = born->work;

    /* Loop to get the proper form for the Born radius term, sse style */
    n0 = 0;
    n1 = natoms;
    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }
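    /* rb[i] now carries the per-atom chain-rule prefactor (dvda[i] scaled
     * according to how each algorithm's radius depends on the descreening
     * sum), so each pair force below is simply the stored dadx term times rb.
     */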
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);
        rbai = _mm_load1_pd(rb+ii);
        fix  = _mm_setzero_pd();
        fiy  = _mm_setzero_pd();
        fiz  = _mm_setzero_pd();
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = nl->jjnr[k];
            jnrB = nl->jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;
            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx = _mm_sub_pd(ix, jx);
            dy = _mm_sub_pd(iy, jy);
            dz = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);
            /* load chain rule terms for j1-2 */
            f_gb    = _mm_load_pd(dadx);
            dadx   += 2;
            f_gb_ai = _mm_load_pd(dadx);
            dadx   += 2;
            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);
            tx = _mm_mul_pd(f_gb, dx);
            ty = _mm_mul_pd(f_gb, dy);
            tz = _mm_mul_pd(f_gb, dz);

            fix = _mm_add_pd(fix, tx);
            fiy = _mm_add_pd(fiy, ty);
            fiz = _mm_add_pd(fiz, tz);
            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }
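        /* Newton's third law: the same pair force accumulated into fix/fiy/fiz
         * for atom i is subtracted from the j atoms, so each pair contributes
         * no net force.
         */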
        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA = nl->jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            dx = _mm_sub_sd(ix, jx);
            dy = _mm_sub_sd(iy, jy);
            dz = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);
            /* load chain rule terms */
            f_gb    = _mm_load_pd(dadx);
            dadx   += 2;
            f_gb_ai = _mm_load_pd(dadx);
            dadx   += 2;
            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);
            tx = _mm_mul_sd(f_gb, dx);
            ty = _mm_mul_sd(f_gb, dy);
            tz = _mm_mul_sd(f_gb, dz);

            fix = _mm_add_sd(fix, tx);
            fiy = _mm_add_sd(fiy, ty);
            fiz = _mm_add_sd(fiz, tz);
            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }
        /* fix/fiy/fiz now contain two partial force terms, that all should be
         * added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }
}
#else
/* keep compiler happy */
int genborn_sse2_dummy;
#endif /* SSE2 intrinsics available */