/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include <math.h>

#include "gromacs/fileio/pdbio.h"
#include "gromacs/legacyheaders/domdec.h"
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"
/* Only compile this file if SSE2 intrinsics are available; note that the
 * "0 &&" in the guard below currently keeps these kernels disabled.
 */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
#include "genborn_sse2_double.h"

#include <emmintrin.h>

#include "gmx_sse2_double.h"
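
/* These kernels compute generalized-Born radii for the Still, HCT and OBC
 * models, plus the matching chain-rule terms for the forces, processing two
 * neighbor-list entries at a time with SSE2 double-precision intrinsics.
 */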
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    double        shX, shY, shZ;
    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;
    double       *shiftvec;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;
    const __m128d half        = _mm_set1_pd(0.5);
    const __m128d three       = _mm_set1_pd(3.0);
    const __m128d one         = _mm_set1_pd(1.0);
    const __m128d two         = _mm_set1_pd(2.0);
    const __m128d zero        = _mm_set1_pd(0.0);
    const __m128d four        = _mm_set1_pd(4.0);

    const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5  = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4    = _mm_set1_pd(STILL_P4);
    factor    = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();
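
    /* Zero the per-atom accumulation buffer before gathering contributions */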
    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);
        /* Polarization energy for atom ai */
        gpi     = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
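
        /* Still model: neighbor j adds P4*vsolv[j]*ccf/r^4 to atom i's
         * polarization sum (and i adds the prod_ai analog to j's); ccf is a
         * close-contact correction that switches on once
         * r^2/(rai+raj)^2 <= 1/P5, and is 1 otherwise.
         */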
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = nl->jjnr[k];
            jnrB = nl->jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);
            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_pd(rinv, rinv);
            rinv4 = _mm_mul_pd(rinv2, rinv2);
            rinv6 = _mm_mul_pd(rinv4, rinv2);

            rvdw  = _mm_add_pd(rai, raj);
            ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp = _mm_cmple_pd(ratio, still_p5inv);
            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_pd(ratio, still_p5inv);
                theta = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf   = _mm_mul_pd(term, term);
                dccf  = _mm_mul_pd(_mm_mul_pd(two, term),
                                   _mm_mul_pd(sinq, theta));
            }
            prod = _mm_mul_pd(still_p4, vaj);
            icf4 = _mm_mul_pd(ccf, rinv4);
            icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi  = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4));

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
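
        /* Note: dadx stores the two chain-rule terms of each pair (i->j and
         * j->i) consecutively; calc_gb_chainrule_sse2_double() reads them
         * back in the same order.
         */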
        if (k < nj1)
        {
            jnrA = nl->jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);
            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq   = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_pd(rsq);
            rinv2 = _mm_mul_sd(rinv, rinv);
            rinv4 = _mm_mul_sd(rinv2, rinv2);
            rinv6 = _mm_mul_sd(rinv4, rinv2);

            rvdw  = _mm_add_sd(rai, raj);
            ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp = _mm_cmple_sd(ratio, still_p5inv);
            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp))
            {
                /* if ratio > still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_pd();
            }
            else
            {
                ratio = _mm_min_sd(ratio, still_p5inv);
                theta = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term  = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf   = _mm_mul_sd(term, term);
                dccf  = _mm_mul_sd(_mm_mul_sd(two, term),
                                   _mm_mul_sd(sinq, theta));
            }
            prod = _mm_mul_sd(still_p4, vaj);
            icf4 = _mm_mul_sd(ccf, rinv4);
            icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi  = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4));

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

        gmx_mm_update_1pot_pd(gpi, work+ii);
    }
    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }
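
    /* Still: the Born radius follows from the accumulated polarization
     * energy, R_i = factor/|gpi_ai| with factor = 0.5*ONE_4PI_EPS0;
     * gmx_invsqrt(gpi2) below serves as a fast 1/|gpi_ai|.
     */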
    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai          = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2            = gpi_ai * gpi_ai;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }
    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double        min_rad, rad;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    double       *shiftvec;
    double       *dadx;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, logterm, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       obc_mask1, obc_mask2, obc_mask3;
    __m128d       oneeighth = _mm_set1_pd(0.125);
    __m128d       onefourth = _mm_set1_pd(0.25);

    const __m128d half      = _mm_set1_pd(0.5);
    const __m128d three     = _mm_set1_pd(3.0);
    const __m128d one       = _mm_set1_pd(1.0);
    const __m128d two       = _mm_set1_pd(2.0);
    const __m128d zero      = _mm_set1_pd(0.0);
    const __m128d neg       = _mm_set1_pd(-1.0);
    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();
    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);
        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai  = _mm_setzero_pd();

        sk_ai   = _mm_load1_pd(born->param+ii);
        sk2_ai  = _mm_mul_pd(sk_ai, sk_ai);
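
        /* HCT/OBC: each neighbor j descreens atom i over the interval
         * [lij, uij], with uij = 1/(r+sk_aj), and lij = 1/(r-sk_aj) when the
         * whole descreening shell lies outside atom i (obc_mask2 below) or
         * 1/rai otherwise. sum_ai accumulates the analytic integral.
         */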
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = nl->jjnr[k];
            jnrB = nl->jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);
            dx   = _mm_sub_pd(ix, jx);
            dy   = _mm_sub_pd(iy, jy);
            dz   = _mm_sub_pd(iz, jz);

            rsq  = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv = gmx_mm_invsqrt_pd(rsq);
            r    = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for aj1-2 */
            raj_inv = gmx_mm_inv_pd(raj);
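
            /* The comparisons below build all-ones/all-zero lane masks, so
             * lij can be selected branchlessly via and/andnot/or:
             * lij = obc_mask2 ? 1/(r-sk_aj) : 1/rai, and
             * dlij = obc_mask2 ? 1 : 0.
             */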
            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_pd(r, sk_aj);
            t2        = _mm_sub_pd(r, sk_aj);
            t3        = _mm_sub_pd(sk_aj, r);
            obc_mask1 = _mm_cmplt_pd(rai, t1);
            obc_mask2 = _mm_cmplt_pd(rai, t2);
            obc_mask3 = _mm_cmplt_pd(rai, t3);

            uij       = gmx_mm_inv_pd(t1);
            lij       = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                   _mm_andnot_pd(obc_mask2, rai_inv));
            dlij      = _mm_and_pd(one, obc_mask2);
            uij2      = _mm_mul_pd(uij, uij);
            uij3      = _mm_mul_pd(uij2, uij);
            lij2      = _mm_mul_pd(lij, lij);
            lij3      = _mm_mul_pd(lij2, lij);
            diff2    = _mm_sub_pd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_aj   = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv = _mm_mul_pd(sk2_aj, rinv);
            prod     = _mm_mul_pd(onefourth, sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
            t1     = _mm_sub_pd(lij, uij);
            t2     = _mm_mul_pd(diff2,
                                _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                           prod));
            t3     = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1     = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4     = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4     = _mm_and_pd(t4, obc_mask3);
            t1     = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1));
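
            /* The pair contribution only counts when rai < r+sk_aj
             * (obc_mask1); otherwise atom j lies entirely inside atom i and
             * does not descreen it.
             */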
            t1    = _mm_add_pd(_mm_mul_pd(half, lij2),
                               _mm_mul_pd(prod, lij3));
            t1    = _mm_sub_pd(t1,
                               _mm_mul_pd(onefourth,
                                          _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                     _mm_mul_pd(lij3, r))));
            t2    = _mm_mul_pd(onefourth,
                               _mm_add_pd(_mm_mul_pd(uij, rinv),
                                          _mm_mul_pd(uij3, r)));
            t2    = _mm_sub_pd(t2,
                               _mm_add_pd(_mm_mul_pd(half, uij2),
                                          _mm_mul_pd(prod, uij3)));
            t3    = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                               _mm_mul_pd(rinv, rinv));
            t3    = _mm_sub_pd(t3,
                               _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                          _mm_add_pd(one,
                                                     _mm_mul_pd(sk2_rinv, rinv))));
            t1    = _mm_mul_pd(rinv,
                               _mm_add_pd(_mm_mul_pd(dlij, t1),
                                          _mm_add_pd(t2, t3)));

            dadx1 = _mm_and_pd(t1, obc_mask1);
            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_pd(r, sk_ai);
            t2        = _mm_sub_pd(r, sk_ai);
            t3        = _mm_sub_pd(sk_ai, r);
            obc_mask1 = _mm_cmplt_pd(raj, t1);
            obc_mask2 = _mm_cmplt_pd(raj, t2);
            obc_mask3 = _mm_cmplt_pd(raj, t3);

            uij       = gmx_mm_inv_pd(t1);
            lij       = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                   _mm_andnot_pd(obc_mask2, raj_inv));
            dlij      = _mm_and_pd(one, obc_mask2);
            uij2      = _mm_mul_pd(uij, uij);
            uij3      = _mm_mul_pd(uij2, uij);
            lij2      = _mm_mul_pd(lij, lij);
            lij3      = _mm_mul_pd(lij2, lij);
            diff2    = _mm_sub_pd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv = _mm_mul_pd(sk2_ai, rinv);
            prod     = _mm_mul_pd(onefourth, sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
            t1 = _mm_sub_pd(lij, uij);
            t2 = _mm_mul_pd(diff2,
                            _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                       prod));
            t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4 = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4 = _mm_and_pd(t4, obc_mask3);
            t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));
            t1    = _mm_add_pd(_mm_mul_pd(half, lij2),
                               _mm_mul_pd(prod, lij3));
            t1    = _mm_sub_pd(t1,
                               _mm_mul_pd(onefourth,
                                          _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                     _mm_mul_pd(lij3, r))));
            t2    = _mm_mul_pd(onefourth,
                               _mm_add_pd(_mm_mul_pd(uij, rinv),
                                          _mm_mul_pd(uij3, r)));
            t2    = _mm_sub_pd(t2,
                               _mm_add_pd(_mm_mul_pd(half, uij2),
                                          _mm_mul_pd(prod, uij3)));
            t3    = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                               _mm_mul_pd(rinv, rinv));
            t3    = _mm_sub_pd(t3,
                               _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                          _mm_add_pd(one,
                                                     _mm_mul_pd(sk2_rinv, rinv))));
            t1    = _mm_mul_pd(rinv,
                               _mm_add_pd(_mm_mul_pd(dlij, t1),
                                          _mm_add_pd(t2, t3)));

            dadx2 = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */
        if (k < nj1)
        {
            jnrA = nl->jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);
            dx   = _mm_sub_sd(ix, jx);
            dy   = _mm_sub_sd(iy, jy);
            dz   = _mm_sub_sd(iz, jz);

            rsq  = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv = gmx_mm_invsqrt_pd(rsq);
            r    = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for aj1 */
            raj_inv = gmx_mm_inv_pd(raj);
            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_sd(r, sk_aj);
            t2        = _mm_sub_sd(r, sk_aj);
            t3        = _mm_sub_sd(sk_aj, r);
            obc_mask1 = _mm_cmplt_sd(rai, t1);
            obc_mask2 = _mm_cmplt_sd(rai, t2);
            obc_mask3 = _mm_cmplt_sd(rai, t3);

            uij       = gmx_mm_inv_pd(t1);
            lij       = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                  _mm_andnot_pd(obc_mask2, rai_inv));
            dlij      = _mm_and_pd(one, obc_mask2);
            uij2      = _mm_mul_sd(uij, uij);
            uij3      = _mm_mul_sd(uij2, uij);
            lij2      = _mm_mul_sd(lij, lij);
            lij3      = _mm_mul_sd(lij2, lij);
            diff2    = _mm_sub_sd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_aj   = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv = _mm_mul_sd(sk2_aj, rinv);
            prod     = _mm_mul_sd(onefourth, sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
            t1     = _mm_sub_sd(lij, uij);
            t2     = _mm_mul_sd(diff2,
                                _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                           prod));
            t3     = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1     = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4     = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4     = _mm_and_pd(t4, obc_mask3);
            t1     = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1));
            t1    = _mm_add_sd(_mm_mul_sd(half, lij2),
                               _mm_mul_sd(prod, lij3));
            t1    = _mm_sub_sd(t1,
                               _mm_mul_sd(onefourth,
                                          _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                     _mm_mul_sd(lij3, r))));
            t2    = _mm_mul_sd(onefourth,
                               _mm_add_sd(_mm_mul_sd(uij, rinv),
                                          _mm_mul_sd(uij3, r)));
            t2    = _mm_sub_sd(t2,
                               _mm_add_sd(_mm_mul_sd(half, uij2),
                                          _mm_mul_sd(prod, uij3)));
            t3    = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                               _mm_mul_sd(rinv, rinv));
            t3    = _mm_sub_sd(t3,
                               _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                          _mm_add_sd(one,
                                                     _mm_mul_sd(sk2_rinv, rinv))));
            t1    = _mm_mul_sd(rinv,
                               _mm_add_sd(_mm_mul_sd(dlij, t1),
                                          _mm_add_sd(t2, t3)));

            dadx1 = _mm_and_pd(t1, obc_mask1);
            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_sd(r, sk_ai);
            t2        = _mm_sub_sd(r, sk_ai);
            t3        = _mm_sub_sd(sk_ai, r);
            obc_mask1 = _mm_cmplt_sd(raj, t1);
            obc_mask2 = _mm_cmplt_sd(raj, t2);
            obc_mask3 = _mm_cmplt_sd(raj, t3);

            uij       = gmx_mm_inv_pd(t1);
            lij       = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                   _mm_andnot_pd(obc_mask2, raj_inv));
            dlij      = _mm_and_pd(one, obc_mask2);
            uij2      = _mm_mul_sd(uij, uij);
            uij3      = _mm_mul_sd(uij2, uij);
            lij2      = _mm_mul_sd(lij, lij);
            lij3      = _mm_mul_sd(lij2, lij);
            diff2    = _mm_sub_sd(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv = _mm_mul_sd(sk2_ai, rinv);
            prod     = _mm_mul_sd(onefourth, sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
            t1 = _mm_sub_sd(lij, uij);
            t2 = _mm_mul_sd(diff2,
                            _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                       prod));
            t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4 = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4 = _mm_and_pd(t4, obc_mask3);
            t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));
            t1    = _mm_add_sd(_mm_mul_sd(half, lij2),
                               _mm_mul_sd(prod, lij3));
            t1    = _mm_sub_sd(t1,
                               _mm_mul_sd(onefourth,
                                          _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                     _mm_mul_sd(lij3, r))));
            t2    = _mm_mul_sd(onefourth,
                               _mm_add_sd(_mm_mul_sd(uij, rinv),
                                          _mm_mul_sd(uij3, r)));
            t2    = _mm_sub_sd(t2,
                               _mm_add_sd(_mm_mul_sd(half, uij2),
                                          _mm_mul_sd(prod, uij3)));
            t3    = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                               _mm_mul_sd(rinv, rinv));
            t3    = _mm_sub_sd(t3,
                               _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                          _mm_add_sd(one,
                                                     _mm_mul_sd(sk2_rinv, rinv))));
            t1    = _mm_mul_sd(rinv,
                               _mm_add_sd(_mm_mul_sd(dlij, t1),
                                          _mm_add_sd(t2, t3)));

            dadx2 = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }

        gmx_mm_update_1pot_pd(sum_ai, work+ii);
    }
    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }
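
    /* HCT: invert the accumulated sum to get the Born radius,
     * 1/R_i = 1/(rho_i - doffset) - work[i], clamped from below by the
     * original van der Waals radius rho_i.
     */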
    if (gb_algorithm == egbHCT)
    {
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
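        /* OBC: 1/R_i = 1/(rho_i - doffset) - tanh(alpha*psi - beta*psi^2 +
         * gamma*psi^3)/rho_i, with psi = (rho_i - doffset)*work[i]; drobc
         * stores the matching tanh chain-rule factor used for the forces.
         */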
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}
void
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz, tx, ty, tz;
    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb = born->work;
    /* Loop to get the proper form for the Born radius term, sse style */
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }
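
    /* rb[i] now holds dV/dR_i times the algorithm-specific dR_i/d(Born sum)
     * factor, so each stored dadx entry times rb gives a scalar pair force.
     */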
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix  = _mm_set1_pd(shX+x[ii3+0]);
        iy  = _mm_set1_pd(shY+x[ii3+1]);
        iz  = _mm_set1_pd(shZ+x[ii3+2]);
        rbai = _mm_load1_pd(rb+ii);
        fix  = _mm_setzero_pd();
        fiy  = _mm_setzero_pd();
        fiz  = _mm_setzero_pd();
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA = nl->jjnr[k];
            jnrB = nl->jjnr[k+1];

            j3A  = 3*jnrA;
            j3B  = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            dx   = _mm_sub_pd(ix, jx);
            dy   = _mm_sub_pd(iy, jy);
            dz   = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for j1-2 */
            f_gb    = _mm_load_pd(dadx);
            dadx   += 2;
            f_gb_ai = _mm_load_pd(dadx);
            dadx   += 2;
            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx      = _mm_mul_pd(f_gb, dx);
            ty      = _mm_mul_pd(f_gb, dy);
            tz      = _mm_mul_pd(f_gb, dz);

            fix     = _mm_add_pd(fix, tx);
            fiy     = _mm_add_pd(fiy, ty);
            fiz     = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }
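
        /* Each iteration above subtracted the pair force from the two j
         * atoms and accumulated the opposite contribution in fix/fiy/fiz.
         */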
        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA = nl->jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            dx   = _mm_sub_sd(ix, jx);
            dy   = _mm_sub_sd(iy, jy);
            dz   = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb    = _mm_load_pd(dadx);
            dadx   += 2;
            f_gb_ai = _mm_load_pd(dadx);
            dadx   += 2;
            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx      = _mm_mul_sd(f_gb, dx);
            ty      = _mm_mul_sd(f_gb, dy);
            tz      = _mm_mul_sd(f_gb, dz);

            fix     = _mm_add_sd(fix, tx);
            fiy     = _mm_add_sd(fiy, ty);
            fiz     = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }
        /* fix/fiy/fiz now contain two partial force terms that should all be
         * added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }
}
#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */