2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2013,2014, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
44 #include "gromacs/legacyheaders/typedefs.h"
45 #include "gromacs/utility/smalloc.h"
46 #include "gromacs/legacyheaders/genborn.h"
47 #include "gromacs/math/vec.h"
48 #include "gromacs/fileio/pdbio.h"
49 #include "gromacs/legacyheaders/names.h"
50 #include "gromacs/math/units.h"
51 #include "gromacs/legacyheaders/domdec.h"
52 #include "gromacs/legacyheaders/network.h"
53 #include "gromacs/utility/fatalerror.h"
54 #include "gromacs/legacyheaders/genborn.h"
56 #include "gromacs/utility/gmxmpi.h"
58 /* Only compile this file if SSE2 intrinsics are available */
59 #if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
60 #include <gmx_sse2_double.h>
61 #include <emmintrin.h>
63 #include "genborn_sse2_double.h"
66 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
67 int natoms, gmx_localtop_t *top,
68 double *x, t_nblist *nl,
71 int i, k, n, ii, is3, ii3, nj0, nj1, offset;
72 int jnrA, jnrB, j3A, j3B;
89 __m128d rsq, rinv, rinv2, rinv4, rinv6;
90 __m128d ratio, gpi, rai, raj, vai, vaj, rvdw;
91 __m128d ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
92 __m128d mask, icf4, icf6, mask_cmp;
94 const __m128d half = _mm_set1_pd(0.5);
95 const __m128d three = _mm_set1_pd(3.0);
96 const __m128d one = _mm_set1_pd(1.0);
97 const __m128d two = _mm_set1_pd(2.0);
98 const __m128d zero = _mm_set1_pd(0.0);
99 const __m128d four = _mm_set1_pd(4.0);
101 const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
102 const __m128d still_pip5 = _mm_set1_pd(STILL_PIP5);
103 const __m128d still_p4 = _mm_set1_pd(STILL_P4);
105 factor = 0.5 * ONE_4PI_EPS0;
107 gb_radius = born->gb_radius;
109 work = born->gpol_still_work;
111 shiftvec = fr->shift_vec[0];
115 jx = _mm_setzero_pd();
116 jy = _mm_setzero_pd();
117 jz = _mm_setzero_pd();
121 for (i = 0; i < natoms; i++)
126 for (i = 0; i < nl->nri; i++)
130 is3 = 3*nl->shift[i];
132 shY = shiftvec[is3+1];
133 shZ = shiftvec[is3+2];
135 nj1 = nl->jindex[i+1];
137 ix = _mm_set1_pd(shX+x[ii3+0]);
138 iy = _mm_set1_pd(shY+x[ii3+1]);
139 iz = _mm_set1_pd(shZ+x[ii3+2]);
142 /* Polarization energy for atom ai */
143 gpi = _mm_setzero_pd();
145 rai = _mm_load1_pd(gb_radius+ii);
146 prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
148 for (k = nj0; k < nj1-1; k += 2)
156 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
158 GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
159 GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);
161 dx = _mm_sub_pd(ix, jx);
162 dy = _mm_sub_pd(iy, jy);
163 dz = _mm_sub_pd(iz, jz);
165 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
166 rinv = gmx_mm_invsqrt_pd(rsq);
167 rinv2 = _mm_mul_pd(rinv, rinv);
168 rinv4 = _mm_mul_pd(rinv2, rinv2);
169 rinv6 = _mm_mul_pd(rinv4, rinv2);
171 rvdw = _mm_add_pd(rai, raj);
172 ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
174 mask_cmp = _mm_cmple_pd(ratio, still_p5inv);
176 /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
177 if (0 == _mm_movemask_pd(mask_cmp) )
179 /* if ratio>still_p5inv for ALL elements */
181 dccf = _mm_setzero_pd();
185 ratio = _mm_min_pd(ratio, still_p5inv);
186 theta = _mm_mul_pd(ratio, still_pip5);
187 gmx_mm_sincos_pd(theta, &sinq, &cosq);
188 term = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
189 ccf = _mm_mul_pd(term, term);
190 dccf = _mm_mul_pd(_mm_mul_pd(two, term),
191 _mm_mul_pd(sinq, theta));
194 prod = _mm_mul_pd(still_p4, vaj);
195 icf4 = _mm_mul_pd(ccf, rinv4);
196 icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);
198 GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));
200 gpi = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );
202 _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
204 _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
214 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
216 GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
217 GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);
219 dx = _mm_sub_sd(ix, jx);
220 dy = _mm_sub_sd(iy, jy);
221 dz = _mm_sub_sd(iz, jz);
223 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
224 rinv = gmx_mm_invsqrt_pd(rsq);
225 rinv2 = _mm_mul_sd(rinv, rinv);
226 rinv4 = _mm_mul_sd(rinv2, rinv2);
227 rinv6 = _mm_mul_sd(rinv4, rinv2);
229 rvdw = _mm_add_sd(rai, raj);
230 ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
232 mask_cmp = _mm_cmple_sd(ratio, still_p5inv);
234 /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
235 if (0 == _mm_movemask_pd(mask_cmp) )
237 /* if ratio>still_p5inv for ALL elements */
239 dccf = _mm_setzero_pd();
243 ratio = _mm_min_sd(ratio, still_p5inv);
244 theta = _mm_mul_sd(ratio, still_pip5);
245 gmx_mm_sincos_pd(theta, &sinq, &cosq);
246 term = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
247 ccf = _mm_mul_sd(term, term);
248 dccf = _mm_mul_sd(_mm_mul_sd(two, term),
249 _mm_mul_sd(sinq, theta));
252 prod = _mm_mul_sd(still_p4, vaj);
253 icf4 = _mm_mul_sd(ccf, rinv4);
254 icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);
256 GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));
258 gpi = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );
260 _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
262 _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
265 gmx_mm_update_1pot_pd(gpi, work+ii);
268 /* Sum up the polarization energy from other nodes */
269 if (DOMAINDECOMP(cr))
271 dd_atom_sum_real(cr->dd, work);
274 /* Compute the radii */
275 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
277 if (born->use[i] != 0)
279 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
280 gpi2 = gpi_ai * gpi_ai;
281 born->bRad[i] = factor*gmx_invsqrt(gpi2);
282 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
286 /* Extra (local) communication required for DD */
287 if (DOMAINDECOMP(cr))
289 dd_atom_spread_real(cr->dd, born->bRad);
290 dd_atom_spread_real(cr->dd, fr->invsqrta);
298 calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
299 double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
301 int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
304 double shX, shY, shZ;
305 double rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
306 double sum_ai2, sum_ai3, tsum, tchain, doffset;
315 __m128d ix, iy, iz, jx, jy, jz;
316 __m128d dx, dy, dz, t1, t2, t3, t4;
317 __m128d rsq, rinv, r;
318 __m128d rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
319 __m128d uij, lij2, uij2, lij3, uij3, diff2;
320 __m128d lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
321 __m128d sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
322 __m128d dadx1, dadx2;
325 __m128d obc_mask1, obc_mask2, obc_mask3;
327 __m128d oneeighth = _mm_set1_pd(0.125);
328 __m128d onefourth = _mm_set1_pd(0.25);
330 const __m128d half = _mm_set1_pd(0.5);
331 const __m128d three = _mm_set1_pd(3.0);
332 const __m128d one = _mm_set1_pd(1.0);
333 const __m128d two = _mm_set1_pd(2.0);
334 const __m128d zero = _mm_set1_pd(0.0);
335 const __m128d neg = _mm_set1_pd(-1.0);
337 /* Set the dielectric offset */
338 doffset = born->gb_doffset;
339 gb_radius = born->gb_radius;
340 obc_param = born->param;
341 work = born->gpol_hct_work;
344 shiftvec = fr->shift_vec[0];
346 jx = _mm_setzero_pd();
347 jy = _mm_setzero_pd();
348 jz = _mm_setzero_pd();
352 for (i = 0; i < born->nr; i++)
357 for (i = 0; i < nl->nri; i++)
361 is3 = 3*nl->shift[i];
363 shY = shiftvec[is3+1];
364 shZ = shiftvec[is3+2];
366 nj1 = nl->jindex[i+1];
368 ix = _mm_set1_pd(shX+x[ii3+0]);
369 iy = _mm_set1_pd(shY+x[ii3+1]);
370 iz = _mm_set1_pd(shZ+x[ii3+2]);
372 rai = _mm_load1_pd(gb_radius+ii);
373 rai_inv = gmx_mm_inv_pd(rai);
375 sum_ai = _mm_setzero_pd();
377 sk_ai = _mm_load1_pd(born->param+ii);
378 sk2_ai = _mm_mul_pd(sk_ai, sk_ai);
380 for (k = nj0; k < nj1-1; k += 2)
388 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
389 GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
390 GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);
392 dx = _mm_sub_pd(ix, jx);
393 dy = _mm_sub_pd(iy, jy);
394 dz = _mm_sub_pd(iz, jz);
396 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
398 rinv = gmx_mm_invsqrt_pd(rsq);
399 r = _mm_mul_pd(rsq, rinv);
401 /* Compute raj_inv aj1-4 */
402 raj_inv = gmx_mm_inv_pd(raj);
404 /* Evaluate influence of atom aj -> ai */
405 t1 = _mm_add_pd(r, sk_aj);
406 t2 = _mm_sub_pd(r, sk_aj);
407 t3 = _mm_sub_pd(sk_aj, r);
408 obc_mask1 = _mm_cmplt_pd(rai, t1);
409 obc_mask2 = _mm_cmplt_pd(rai, t2);
410 obc_mask3 = _mm_cmplt_pd(rai, t3);
412 uij = gmx_mm_inv_pd(t1);
413 lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
414 _mm_andnot_pd(obc_mask2, rai_inv));
415 dlij = _mm_and_pd(one, obc_mask2);
416 uij2 = _mm_mul_pd(uij, uij);
417 uij3 = _mm_mul_pd(uij2, uij);
418 lij2 = _mm_mul_pd(lij, lij);
419 lij3 = _mm_mul_pd(lij2, lij);
421 diff2 = _mm_sub_pd(uij2, lij2);
422 lij_inv = gmx_mm_invsqrt_pd(lij2);
423 sk2_aj = _mm_mul_pd(sk_aj, sk_aj);
424 sk2_rinv = _mm_mul_pd(sk2_aj, rinv);
425 prod = _mm_mul_pd(onefourth, sk2_rinv);
427 logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
429 t1 = _mm_sub_pd(lij, uij);
430 t2 = _mm_mul_pd(diff2,
431 _mm_sub_pd(_mm_mul_pd(onefourth, r),
433 t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
434 t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3));
435 t4 = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
436 t4 = _mm_and_pd(t4, obc_mask3);
437 t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4));
439 sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );
441 t1 = _mm_add_pd(_mm_mul_pd(half, lij2),
442 _mm_mul_pd(prod, lij3));
444 _mm_mul_pd(onefourth,
445 _mm_add_pd(_mm_mul_pd(lij, rinv),
446 _mm_mul_pd(lij3, r))));
447 t2 = _mm_mul_pd(onefourth,
448 _mm_add_pd(_mm_mul_pd(uij, rinv),
449 _mm_mul_pd(uij3, r)));
451 _mm_add_pd(_mm_mul_pd(half, uij2),
452 _mm_mul_pd(prod, uij3)));
453 t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
454 _mm_mul_pd(rinv, rinv));
456 _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
458 _mm_mul_pd(sk2_rinv, rinv))));
459 t1 = _mm_mul_pd(rinv,
460 _mm_add_pd(_mm_mul_pd(dlij, t1),
461 _mm_add_pd(t2, t3)));
463 dadx1 = _mm_and_pd(t1, obc_mask1);
465 /* Evaluate influence of atom ai -> aj */
466 t1 = _mm_add_pd(r, sk_ai);
467 t2 = _mm_sub_pd(r, sk_ai);
468 t3 = _mm_sub_pd(sk_ai, r);
469 obc_mask1 = _mm_cmplt_pd(raj, t1);
470 obc_mask2 = _mm_cmplt_pd(raj, t2);
471 obc_mask3 = _mm_cmplt_pd(raj, t3);
473 uij = gmx_mm_inv_pd(t1);
474 lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
475 _mm_andnot_pd(obc_mask2, raj_inv));
476 dlij = _mm_and_pd(one, obc_mask2);
477 uij2 = _mm_mul_pd(uij, uij);
478 uij3 = _mm_mul_pd(uij2, uij);
479 lij2 = _mm_mul_pd(lij, lij);
480 lij3 = _mm_mul_pd(lij2, lij);
482 diff2 = _mm_sub_pd(uij2, lij2);
483 lij_inv = gmx_mm_invsqrt_pd(lij2);
484 sk2_rinv = _mm_mul_pd(sk2_ai, rinv);
485 prod = _mm_mul_pd(onefourth, sk2_rinv);
487 logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
489 t1 = _mm_sub_pd(lij, uij);
490 t2 = _mm_mul_pd(diff2,
491 _mm_sub_pd(_mm_mul_pd(onefourth, r),
493 t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
494 t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3));
495 t4 = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
496 t4 = _mm_and_pd(t4, obc_mask3);
497 t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4));
499 GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));
501 t1 = _mm_add_pd(_mm_mul_pd(half, lij2),
502 _mm_mul_pd(prod, lij3));
504 _mm_mul_pd(onefourth,
505 _mm_add_pd(_mm_mul_pd(lij, rinv),
506 _mm_mul_pd(lij3, r))));
507 t2 = _mm_mul_pd(onefourth,
508 _mm_add_pd(_mm_mul_pd(uij, rinv),
509 _mm_mul_pd(uij3, r)));
511 _mm_add_pd(_mm_mul_pd(half, uij2),
512 _mm_mul_pd(prod, uij3)));
513 t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
514 _mm_mul_pd(rinv, rinv));
516 _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
518 _mm_mul_pd(sk2_rinv, rinv))));
519 t1 = _mm_mul_pd(rinv,
520 _mm_add_pd(_mm_mul_pd(dlij, t1),
521 _mm_add_pd(t2, t3)));
523 dadx2 = _mm_and_pd(t1, obc_mask1);
525 _mm_store_pd(dadx, dadx1);
527 _mm_store_pd(dadx, dadx2);
529 } /* end normal inner loop */
537 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
538 GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
539 GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);
541 dx = _mm_sub_sd(ix, jx);
542 dy = _mm_sub_sd(iy, jy);
543 dz = _mm_sub_sd(iz, jz);
545 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
547 rinv = gmx_mm_invsqrt_pd(rsq);
548 r = _mm_mul_sd(rsq, rinv);
550 /* Compute raj_inv aj1-4 */
551 raj_inv = gmx_mm_inv_pd(raj);
553 /* Evaluate influence of atom aj -> ai */
554 t1 = _mm_add_sd(r, sk_aj);
555 t2 = _mm_sub_sd(r, sk_aj);
556 t3 = _mm_sub_sd(sk_aj, r);
557 obc_mask1 = _mm_cmplt_sd(rai, t1);
558 obc_mask2 = _mm_cmplt_sd(rai, t2);
559 obc_mask3 = _mm_cmplt_sd(rai, t3);
561 uij = gmx_mm_inv_pd(t1);
562 lij = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
563 _mm_andnot_pd(obc_mask2, rai_inv));
564 dlij = _mm_and_pd(one, obc_mask2);
565 uij2 = _mm_mul_sd(uij, uij);
566 uij3 = _mm_mul_sd(uij2, uij);
567 lij2 = _mm_mul_sd(lij, lij);
568 lij3 = _mm_mul_sd(lij2, lij);
570 diff2 = _mm_sub_sd(uij2, lij2);
571 lij_inv = gmx_mm_invsqrt_pd(lij2);
572 sk2_aj = _mm_mul_sd(sk_aj, sk_aj);
573 sk2_rinv = _mm_mul_sd(sk2_aj, rinv);
574 prod = _mm_mul_sd(onefourth, sk2_rinv);
576 logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
578 t1 = _mm_sub_sd(lij, uij);
579 t2 = _mm_mul_sd(diff2,
580 _mm_sub_sd(_mm_mul_pd(onefourth, r),
582 t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
583 t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3));
584 t4 = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
585 t4 = _mm_and_pd(t4, obc_mask3);
586 t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4));
588 sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );
590 t1 = _mm_add_sd(_mm_mul_sd(half, lij2),
591 _mm_mul_sd(prod, lij3));
593 _mm_mul_sd(onefourth,
594 _mm_add_sd(_mm_mul_sd(lij, rinv),
595 _mm_mul_sd(lij3, r))));
596 t2 = _mm_mul_sd(onefourth,
597 _mm_add_sd(_mm_mul_sd(uij, rinv),
598 _mm_mul_sd(uij3, r)));
600 _mm_add_sd(_mm_mul_sd(half, uij2),
601 _mm_mul_sd(prod, uij3)));
602 t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
603 _mm_mul_sd(rinv, rinv));
605 _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
607 _mm_mul_sd(sk2_rinv, rinv))));
608 t1 = _mm_mul_sd(rinv,
609 _mm_add_sd(_mm_mul_sd(dlij, t1),
610 _mm_add_pd(t2, t3)));
612 dadx1 = _mm_and_pd(t1, obc_mask1);
614 /* Evaluate influence of atom ai -> aj */
615 t1 = _mm_add_sd(r, sk_ai);
616 t2 = _mm_sub_sd(r, sk_ai);
617 t3 = _mm_sub_sd(sk_ai, r);
618 obc_mask1 = _mm_cmplt_sd(raj, t1);
619 obc_mask2 = _mm_cmplt_sd(raj, t2);
620 obc_mask3 = _mm_cmplt_sd(raj, t3);
622 uij = gmx_mm_inv_pd(t1);
623 lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
624 _mm_andnot_pd(obc_mask2, raj_inv));
625 dlij = _mm_and_pd(one, obc_mask2);
626 uij2 = _mm_mul_sd(uij, uij);
627 uij3 = _mm_mul_sd(uij2, uij);
628 lij2 = _mm_mul_sd(lij, lij);
629 lij3 = _mm_mul_sd(lij2, lij);
631 diff2 = _mm_sub_sd(uij2, lij2);
632 lij_inv = gmx_mm_invsqrt_pd(lij2);
633 sk2_rinv = _mm_mul_sd(sk2_ai, rinv);
634 prod = _mm_mul_sd(onefourth, sk2_rinv);
636 logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
638 t1 = _mm_sub_sd(lij, uij);
639 t2 = _mm_mul_sd(diff2,
640 _mm_sub_sd(_mm_mul_sd(onefourth, r),
642 t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
643 t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3));
644 t4 = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
645 t4 = _mm_and_pd(t4, obc_mask3);
646 t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4));
648 GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));
650 t1 = _mm_add_sd(_mm_mul_sd(half, lij2),
651 _mm_mul_sd(prod, lij3));
653 _mm_mul_sd(onefourth,
654 _mm_add_sd(_mm_mul_sd(lij, rinv),
655 _mm_mul_sd(lij3, r))));
656 t2 = _mm_mul_sd(onefourth,
657 _mm_add_sd(_mm_mul_sd(uij, rinv),
658 _mm_mul_sd(uij3, r)));
660 _mm_add_sd(_mm_mul_sd(half, uij2),
661 _mm_mul_sd(prod, uij3)));
662 t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
663 _mm_mul_sd(rinv, rinv));
665 _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
667 _mm_mul_sd(sk2_rinv, rinv))));
668 t1 = _mm_mul_sd(rinv,
669 _mm_add_sd(_mm_mul_sd(dlij, t1),
670 _mm_add_sd(t2, t3)));
672 dadx2 = _mm_and_pd(t1, obc_mask1);
674 _mm_store_pd(dadx, dadx1);
676 _mm_store_pd(dadx, dadx2);
679 gmx_mm_update_1pot_pd(sum_ai, work+ii);
683 /* Parallel summations */
684 if (DOMAINDECOMP(cr))
686 dd_atom_sum_real(cr->dd, work);
689 if (gb_algorithm == egbHCT)
692 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
694 if (born->use[i] != 0)
696 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
697 sum = 1.0/rr - work[i];
698 min_rad = rr + doffset;
701 born->bRad[i] = rad > min_rad ? rad : min_rad;
702 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
706 /* Extra communication required for DD */
707 if (DOMAINDECOMP(cr))
709 dd_atom_spread_real(cr->dd, born->bRad);
710 dd_atom_spread_real(cr->dd, fr->invsqrta);
716 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
718 if (born->use[i] != 0)
720 rr = top->atomtypes.gb_radius[md->typeA[i]];
728 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
729 born->bRad[i] = rr_inv - tsum*rr_inv2;
730 born->bRad[i] = 1.0 / born->bRad[i];
732 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
734 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
735 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
738 /* Extra (local) communication required for DD */
739 if (DOMAINDECOMP(cr))
741 dd_atom_spread_real(cr->dd, born->bRad);
742 dd_atom_spread_real(cr->dd, fr->invsqrta);
743 dd_atom_spread_real(cr->dd, born->drobc);
754 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
755 double *x, double *f, double *fshift, double *shiftvec,
756 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
758 int i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
763 double rbi, shX, shY, shZ;
768 __m128d fix, fiy, fiz;
772 __m128d rbai, rbaj, f_gb, f_gb_ai;
773 __m128d xmm1, xmm2, xmm3;
775 const __m128d two = _mm_set1_pd(2.0);
781 /* Loop to get the proper form for the Born radius term, sse style */
785 if (gb_algorithm == egbSTILL)
787 for (i = n0; i < n1; i++)
790 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
793 else if (gb_algorithm == egbHCT)
795 for (i = n0; i < n1; i++)
798 rb[i] = rbi * rbi * dvda[i];
801 else if (gb_algorithm == egbOBC)
803 for (i = n0; i < n1; i++)
806 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
810 jz = _mm_setzero_pd();
814 for (i = 0; i < nl->nri; i++)
818 is3 = 3*nl->shift[i];
820 shY = shiftvec[is3+1];
821 shZ = shiftvec[is3+2];
823 nj1 = nl->jindex[i+1];
825 ix = _mm_set1_pd(shX+x[ii3+0]);
826 iy = _mm_set1_pd(shY+x[ii3+1]);
827 iz = _mm_set1_pd(shZ+x[ii3+2]);
829 rbai = _mm_load1_pd(rb+ii);
830 fix = _mm_setzero_pd();
831 fiy = _mm_setzero_pd();
832 fiz = _mm_setzero_pd();
835 for (k = nj0; k < nj1-1; k += 2)
843 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
845 dx = _mm_sub_pd(ix, jx);
846 dy = _mm_sub_pd(iy, jy);
847 dz = _mm_sub_pd(iz, jz);
849 GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);
851 /* load chain rule terms for j1-4 */
852 f_gb = _mm_load_pd(dadx);
854 f_gb_ai = _mm_load_pd(dadx);
857 /* calculate scalar force */
858 f_gb = _mm_mul_pd(f_gb, rbai);
859 f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
860 f_gb = _mm_add_pd(f_gb, f_gb_ai);
862 tx = _mm_mul_pd(f_gb, dx);
863 ty = _mm_mul_pd(f_gb, dy);
864 tz = _mm_mul_pd(f_gb, dz);
866 fix = _mm_add_pd(fix, tx);
867 fiy = _mm_add_pd(fiy, ty);
868 fiz = _mm_add_pd(fiz, tz);
870 GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
873 /*deal with odd elements */
879 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
881 dx = _mm_sub_sd(ix, jx);
882 dy = _mm_sub_sd(iy, jy);
883 dz = _mm_sub_sd(iz, jz);
885 GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);
887 /* load chain rule terms */
888 f_gb = _mm_load_pd(dadx);
890 f_gb_ai = _mm_load_pd(dadx);
893 /* calculate scalar force */
894 f_gb = _mm_mul_sd(f_gb, rbai);
895 f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
896 f_gb = _mm_add_sd(f_gb, f_gb_ai);
898 tx = _mm_mul_sd(f_gb, dx);
899 ty = _mm_mul_sd(f_gb, dy);
900 tz = _mm_mul_sd(f_gb, dz);
902 fix = _mm_add_sd(fix, tx);
903 fiy = _mm_add_sd(fiy, ty);
904 fiz = _mm_add_sd(fiz, tz);
906 GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
909 /* fix/fiy/fiz now contain four partial force terms, that all should be
910 * added to the i particle forces and shift forces.
912 gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
919 /* keep compiler happy */
920 int genborn_sse2_dummy;
922 #endif /* SSE2 intrinsics available */