1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
10 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12 * Copyright (c) 2001-2008, The GROMACS development team,
13 * check out http://www.gromacs.org for more information.
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version 2
18 * of the License, or (at your option) any later version.
20 * If you want to redistribute modifications, please consider that
21 * scientific software is very special. Version control is crucial -
22 * bugs must be traceable. We will be happy to consider code for
23 * inclusion in the official distribution, but derived work must not
24 * be called official GROMACS. Details are found in the README & COPYING
25 * files - if they are missing, get the official version at www.gromacs.org.
27 * To help us fund GROMACS development, we humbly ask that you cite
28 * the papers on the package - you can find them in the top README file.
30 * For more info, check our website at http://www.gromacs.org
33 * Groningen Machine for Chemical Simulation
53 #include "gmx_fatal.h"
54 #include "mtop_util.h"
64 /* Only compile this file if SSE2 intrinsics are available */
65 #if 0 && defined (GMX_X86_SSE2)
66 #include <gmx_sse2_double.h>
67 #include <emmintrin.h>
69 #include "genborn_sse2_double.h"
72 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
73 int natoms, gmx_localtop_t *top,
74 const t_atomtypes *atype, double *x, t_nblist *nl,
77 int i, k, n, ii, is3, ii3, nj0, nj1, offset;
78 int jnrA, jnrB, j3A, j3B;
95 __m128d rsq, rinv, rinv2, rinv4, rinv6;
96 __m128d ratio, gpi, rai, raj, vai, vaj, rvdw;
97 __m128d ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
98 __m128d mask, icf4, icf6, mask_cmp;
100 const __m128d half = _mm_set1_pd(0.5);
101 const __m128d three = _mm_set1_pd(3.0);
102 const __m128d one = _mm_set1_pd(1.0);
103 const __m128d two = _mm_set1_pd(2.0);
104 const __m128d zero = _mm_set1_pd(0.0);
105 const __m128d four = _mm_set1_pd(4.0);
107 const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV);
108 const __m128d still_pip5 = _mm_set1_pd(STILL_PIP5);
109 const __m128d still_p4 = _mm_set1_pd(STILL_P4);
111 factor = 0.5 * ONE_4PI_EPS0;
113 gb_radius = born->gb_radius;
115 work = born->gpol_still_work;
117 shiftvec = fr->shift_vec[0];
121 jx = _mm_setzero_pd();
122 jy = _mm_setzero_pd();
123 jz = _mm_setzero_pd();
127 for (i = 0; i < natoms; i++)
132 for (i = 0; i < nl->nri; i++)
136 is3 = 3*nl->shift[i];
138 shY = shiftvec[is3+1];
139 shZ = shiftvec[is3+2];
141 nj1 = nl->jindex[i+1];
143 ix = _mm_set1_pd(shX+x[ii3+0]);
144 iy = _mm_set1_pd(shY+x[ii3+1]);
145 iz = _mm_set1_pd(shZ+x[ii3+2]);
148 /* Polarization energy for atom ai */
149 gpi = _mm_setzero_pd();
151 rai = _mm_load1_pd(gb_radius+ii);
152 prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
154 for (k = nj0; k < nj1-1; k += 2)
162 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
164 GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
165 GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);
167 dx = _mm_sub_pd(ix, jx);
168 dy = _mm_sub_pd(iy, jy);
169 dz = _mm_sub_pd(iz, jz);
171 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
172 rinv = gmx_mm_invsqrt_pd(rsq);
173 rinv2 = _mm_mul_pd(rinv, rinv);
174 rinv4 = _mm_mul_pd(rinv2, rinv2);
175 rinv6 = _mm_mul_pd(rinv4, rinv2);
177 rvdw = _mm_add_pd(rai, raj);
178 ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
180 mask_cmp = _mm_cmple_pd(ratio, still_p5inv);
182 /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
183 if (0 == _mm_movemask_pd(mask_cmp) )
185 /* if ratio>still_p5inv for ALL elements */
187 dccf = _mm_setzero_pd();
191 ratio = _mm_min_pd(ratio, still_p5inv);
192 theta = _mm_mul_pd(ratio, still_pip5);
193 gmx_mm_sincos_pd(theta, &sinq, &cosq);
194 term = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
195 ccf = _mm_mul_pd(term, term);
196 dccf = _mm_mul_pd(_mm_mul_pd(two, term),
197 _mm_mul_pd(sinq, theta));
200 prod = _mm_mul_pd(still_p4, vaj);
201 icf4 = _mm_mul_pd(ccf, rinv4);
202 icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);
204 GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));
206 gpi = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );
208 _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
210 _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
220 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
222 GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
223 GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);
225 dx = _mm_sub_sd(ix, jx);
226 dy = _mm_sub_sd(iy, jy);
227 dz = _mm_sub_sd(iz, jz);
229 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
230 rinv = gmx_mm_invsqrt_pd(rsq);
231 rinv2 = _mm_mul_sd(rinv, rinv);
232 rinv4 = _mm_mul_sd(rinv2, rinv2);
233 rinv6 = _mm_mul_sd(rinv4, rinv2);
235 rvdw = _mm_add_sd(rai, raj);
236 ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
238 mask_cmp = _mm_cmple_sd(ratio, still_p5inv);
240 /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
241 if (0 == _mm_movemask_pd(mask_cmp) )
243 /* if ratio>still_p5inv for ALL elements */
245 dccf = _mm_setzero_pd();
249 ratio = _mm_min_sd(ratio, still_p5inv);
250 theta = _mm_mul_sd(ratio, still_pip5);
251 gmx_mm_sincos_pd(theta, &sinq, &cosq);
252 term = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
253 ccf = _mm_mul_sd(term, term);
254 dccf = _mm_mul_sd(_mm_mul_sd(two, term),
255 _mm_mul_sd(sinq, theta));
258 prod = _mm_mul_sd(still_p4, vaj);
259 icf4 = _mm_mul_sd(ccf, rinv4);
260 icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);
262 GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));
264 gpi = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );
266 _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
268 _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
271 gmx_mm_update_1pot_pd(gpi, work+ii);
274 /* Sum up the polarization energy from other nodes */
277 gmx_sum(natoms, work, cr);
279 else if (DOMAINDECOMP(cr))
281 dd_atom_sum_real(cr->dd, work);
284 /* Compute the radii */
285 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
287 if (born->use[i] != 0)
289 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
290 gpi2 = gpi_ai * gpi_ai;
291 born->bRad[i] = factor*gmx_invsqrt(gpi2);
292 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
296 /* Extra (local) communication required for DD */
297 if (DOMAINDECOMP(cr))
299 dd_atom_spread_real(cr->dd, born->bRad);
300 dd_atom_spread_real(cr->dd, fr->invsqrta);
308 calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
309 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
311 int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
314 double shX, shY, shZ;
315 double rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
316 double sum_ai2, sum_ai3, tsum, tchain, doffset;
325 __m128d ix, iy, iz, jx, jy, jz;
326 __m128d dx, dy, dz, t1, t2, t3, t4;
327 __m128d rsq, rinv, r;
328 __m128d rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
329 __m128d uij, lij2, uij2, lij3, uij3, diff2;
330 __m128d lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
331 __m128d sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
332 __m128d dadx1, dadx2;
335 __m128d obc_mask1, obc_mask2, obc_mask3;
337 __m128d oneeighth = _mm_set1_pd(0.125);
338 __m128d onefourth = _mm_set1_pd(0.25);
340 const __m128d half = _mm_set1_pd(0.5);
341 const __m128d three = _mm_set1_pd(3.0);
342 const __m128d one = _mm_set1_pd(1.0);
343 const __m128d two = _mm_set1_pd(2.0);
344 const __m128d zero = _mm_set1_pd(0.0);
345 const __m128d neg = _mm_set1_pd(-1.0);
347 /* Set the dielectric offset */
348 doffset = born->gb_doffset;
349 gb_radius = born->gb_radius;
350 obc_param = born->param;
351 work = born->gpol_hct_work;
354 shiftvec = fr->shift_vec[0];
356 jx = _mm_setzero_pd();
357 jy = _mm_setzero_pd();
358 jz = _mm_setzero_pd();
362 for (i = 0; i < born->nr; i++)
367 for (i = 0; i < nl->nri; i++)
371 is3 = 3*nl->shift[i];
373 shY = shiftvec[is3+1];
374 shZ = shiftvec[is3+2];
376 nj1 = nl->jindex[i+1];
378 ix = _mm_set1_pd(shX+x[ii3+0]);
379 iy = _mm_set1_pd(shY+x[ii3+1]);
380 iz = _mm_set1_pd(shZ+x[ii3+2]);
382 rai = _mm_load1_pd(gb_radius+ii);
383 rai_inv = gmx_mm_inv_pd(rai);
385 sum_ai = _mm_setzero_pd();
387 sk_ai = _mm_load1_pd(born->param+ii);
388 sk2_ai = _mm_mul_pd(sk_ai, sk_ai);
390 for (k = nj0; k < nj1-1; k += 2)
398 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
399 GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
400 GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);
402 dx = _mm_sub_pd(ix, jx);
403 dy = _mm_sub_pd(iy, jy);
404 dz = _mm_sub_pd(iz, jz);
406 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
408 rinv = gmx_mm_invsqrt_pd(rsq);
409 r = _mm_mul_pd(rsq, rinv);
411 /* Compute raj_inv aj1-4 */
412 raj_inv = gmx_mm_inv_pd(raj);
414 /* Evaluate influence of atom aj -> ai */
415 t1 = _mm_add_pd(r, sk_aj);
416 t2 = _mm_sub_pd(r, sk_aj);
417 t3 = _mm_sub_pd(sk_aj, r);
418 obc_mask1 = _mm_cmplt_pd(rai, t1);
419 obc_mask2 = _mm_cmplt_pd(rai, t2);
420 obc_mask3 = _mm_cmplt_pd(rai, t3);
422 uij = gmx_mm_inv_pd(t1);
423 lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
424 _mm_andnot_pd(obc_mask2, rai_inv));
425 dlij = _mm_and_pd(one, obc_mask2);
426 uij2 = _mm_mul_pd(uij, uij);
427 uij3 = _mm_mul_pd(uij2, uij);
428 lij2 = _mm_mul_pd(lij, lij);
429 lij3 = _mm_mul_pd(lij2, lij);
431 diff2 = _mm_sub_pd(uij2, lij2);
432 lij_inv = gmx_mm_invsqrt_pd(lij2);
433 sk2_aj = _mm_mul_pd(sk_aj, sk_aj);
434 sk2_rinv = _mm_mul_pd(sk2_aj, rinv);
435 prod = _mm_mul_pd(onefourth, sk2_rinv);
437 logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
439 t1 = _mm_sub_pd(lij, uij);
440 t2 = _mm_mul_pd(diff2,
441 _mm_sub_pd(_mm_mul_pd(onefourth, r),
443 t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
444 t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3));
445 t4 = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
446 t4 = _mm_and_pd(t4, obc_mask3);
447 t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4));
449 sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );
451 t1 = _mm_add_pd(_mm_mul_pd(half, lij2),
452 _mm_mul_pd(prod, lij3));
454 _mm_mul_pd(onefourth,
455 _mm_add_pd(_mm_mul_pd(lij, rinv),
456 _mm_mul_pd(lij3, r))));
457 t2 = _mm_mul_pd(onefourth,
458 _mm_add_pd(_mm_mul_pd(uij, rinv),
459 _mm_mul_pd(uij3, r)));
461 _mm_add_pd(_mm_mul_pd(half, uij2),
462 _mm_mul_pd(prod, uij3)));
463 t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
464 _mm_mul_pd(rinv, rinv));
466 _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
468 _mm_mul_pd(sk2_rinv, rinv))));
469 t1 = _mm_mul_pd(rinv,
470 _mm_add_pd(_mm_mul_pd(dlij, t1),
471 _mm_add_pd(t2, t3)));
473 dadx1 = _mm_and_pd(t1, obc_mask1);
475 /* Evaluate influence of atom ai -> aj */
476 t1 = _mm_add_pd(r, sk_ai);
477 t2 = _mm_sub_pd(r, sk_ai);
478 t3 = _mm_sub_pd(sk_ai, r);
479 obc_mask1 = _mm_cmplt_pd(raj, t1);
480 obc_mask2 = _mm_cmplt_pd(raj, t2);
481 obc_mask3 = _mm_cmplt_pd(raj, t3);
483 uij = gmx_mm_inv_pd(t1);
484 lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
485 _mm_andnot_pd(obc_mask2, raj_inv));
486 dlij = _mm_and_pd(one, obc_mask2);
487 uij2 = _mm_mul_pd(uij, uij);
488 uij3 = _mm_mul_pd(uij2, uij);
489 lij2 = _mm_mul_pd(lij, lij);
490 lij3 = _mm_mul_pd(lij2, lij);
492 diff2 = _mm_sub_pd(uij2, lij2);
493 lij_inv = gmx_mm_invsqrt_pd(lij2);
494 sk2_rinv = _mm_mul_pd(sk2_ai, rinv);
495 prod = _mm_mul_pd(onefourth, sk2_rinv);
497 logterm = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
499 t1 = _mm_sub_pd(lij, uij);
500 t2 = _mm_mul_pd(diff2,
501 _mm_sub_pd(_mm_mul_pd(onefourth, r),
503 t3 = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
504 t1 = _mm_add_pd(t1, _mm_add_pd(t2, t3));
505 t4 = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
506 t4 = _mm_and_pd(t4, obc_mask3);
507 t1 = _mm_mul_pd(half, _mm_add_pd(t1, t4));
509 GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));
511 t1 = _mm_add_pd(_mm_mul_pd(half, lij2),
512 _mm_mul_pd(prod, lij3));
514 _mm_mul_pd(onefourth,
515 _mm_add_pd(_mm_mul_pd(lij, rinv),
516 _mm_mul_pd(lij3, r))));
517 t2 = _mm_mul_pd(onefourth,
518 _mm_add_pd(_mm_mul_pd(uij, rinv),
519 _mm_mul_pd(uij3, r)));
521 _mm_add_pd(_mm_mul_pd(half, uij2),
522 _mm_mul_pd(prod, uij3)));
523 t3 = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
524 _mm_mul_pd(rinv, rinv));
526 _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
528 _mm_mul_pd(sk2_rinv, rinv))));
529 t1 = _mm_mul_pd(rinv,
530 _mm_add_pd(_mm_mul_pd(dlij, t1),
531 _mm_add_pd(t2, t3)));
533 dadx2 = _mm_and_pd(t1, obc_mask1);
535 _mm_store_pd(dadx, dadx1);
537 _mm_store_pd(dadx, dadx2);
539 } /* end normal inner loop */
547 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
548 GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
549 GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);
551 dx = _mm_sub_sd(ix, jx);
552 dy = _mm_sub_sd(iy, jy);
553 dz = _mm_sub_sd(iz, jz);
555 rsq = gmx_mm_calc_rsq_pd(dx, dy, dz);
557 rinv = gmx_mm_invsqrt_pd(rsq);
558 r = _mm_mul_sd(rsq, rinv);
560 /* Compute raj_inv aj1-4 */
561 raj_inv = gmx_mm_inv_pd(raj);
563 /* Evaluate influence of atom aj -> ai */
564 t1 = _mm_add_sd(r, sk_aj);
565 t2 = _mm_sub_sd(r, sk_aj);
566 t3 = _mm_sub_sd(sk_aj, r);
567 obc_mask1 = _mm_cmplt_sd(rai, t1);
568 obc_mask2 = _mm_cmplt_sd(rai, t2);
569 obc_mask3 = _mm_cmplt_sd(rai, t3);
571 uij = gmx_mm_inv_pd(t1);
572 lij = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
573 _mm_andnot_pd(obc_mask2, rai_inv));
574 dlij = _mm_and_pd(one, obc_mask2);
575 uij2 = _mm_mul_sd(uij, uij);
576 uij3 = _mm_mul_sd(uij2, uij);
577 lij2 = _mm_mul_sd(lij, lij);
578 lij3 = _mm_mul_sd(lij2, lij);
580 diff2 = _mm_sub_sd(uij2, lij2);
581 lij_inv = gmx_mm_invsqrt_pd(lij2);
582 sk2_aj = _mm_mul_sd(sk_aj, sk_aj);
583 sk2_rinv = _mm_mul_sd(sk2_aj, rinv);
584 prod = _mm_mul_sd(onefourth, sk2_rinv);
586 logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
588 t1 = _mm_sub_sd(lij, uij);
589 t2 = _mm_mul_sd(diff2,
590 _mm_sub_sd(_mm_mul_pd(onefourth, r),
592 t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
593 t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3));
594 t4 = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
595 t4 = _mm_and_pd(t4, obc_mask3);
596 t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4));
598 sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );
600 t1 = _mm_add_sd(_mm_mul_sd(half, lij2),
601 _mm_mul_sd(prod, lij3));
603 _mm_mul_sd(onefourth,
604 _mm_add_sd(_mm_mul_sd(lij, rinv),
605 _mm_mul_sd(lij3, r))));
606 t2 = _mm_mul_sd(onefourth,
607 _mm_add_sd(_mm_mul_sd(uij, rinv),
608 _mm_mul_sd(uij3, r)));
610 _mm_add_sd(_mm_mul_sd(half, uij2),
611 _mm_mul_sd(prod, uij3)));
612 t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
613 _mm_mul_sd(rinv, rinv));
615 _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
617 _mm_mul_sd(sk2_rinv, rinv))));
618 t1 = _mm_mul_sd(rinv,
619 _mm_add_sd(_mm_mul_sd(dlij, t1),
620 _mm_add_pd(t2, t3)));
622 dadx1 = _mm_and_pd(t1, obc_mask1);
624 /* Evaluate influence of atom ai -> aj */
625 t1 = _mm_add_sd(r, sk_ai);
626 t2 = _mm_sub_sd(r, sk_ai);
627 t3 = _mm_sub_sd(sk_ai, r);
628 obc_mask1 = _mm_cmplt_sd(raj, t1);
629 obc_mask2 = _mm_cmplt_sd(raj, t2);
630 obc_mask3 = _mm_cmplt_sd(raj, t3);
632 uij = gmx_mm_inv_pd(t1);
633 lij = _mm_or_pd( _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
634 _mm_andnot_pd(obc_mask2, raj_inv));
635 dlij = _mm_and_pd(one, obc_mask2);
636 uij2 = _mm_mul_sd(uij, uij);
637 uij3 = _mm_mul_sd(uij2, uij);
638 lij2 = _mm_mul_sd(lij, lij);
639 lij3 = _mm_mul_sd(lij2, lij);
641 diff2 = _mm_sub_sd(uij2, lij2);
642 lij_inv = gmx_mm_invsqrt_pd(lij2);
643 sk2_rinv = _mm_mul_sd(sk2_ai, rinv);
644 prod = _mm_mul_sd(onefourth, sk2_rinv);
646 logterm = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
648 t1 = _mm_sub_sd(lij, uij);
649 t2 = _mm_mul_sd(diff2,
650 _mm_sub_sd(_mm_mul_sd(onefourth, r),
652 t3 = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
653 t1 = _mm_add_sd(t1, _mm_add_sd(t2, t3));
654 t4 = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
655 t4 = _mm_and_pd(t4, obc_mask3);
656 t1 = _mm_mul_sd(half, _mm_add_sd(t1, t4));
658 GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));
660 t1 = _mm_add_sd(_mm_mul_sd(half, lij2),
661 _mm_mul_sd(prod, lij3));
663 _mm_mul_sd(onefourth,
664 _mm_add_sd(_mm_mul_sd(lij, rinv),
665 _mm_mul_sd(lij3, r))));
666 t2 = _mm_mul_sd(onefourth,
667 _mm_add_sd(_mm_mul_sd(uij, rinv),
668 _mm_mul_sd(uij3, r)));
670 _mm_add_sd(_mm_mul_sd(half, uij2),
671 _mm_mul_sd(prod, uij3)));
672 t3 = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
673 _mm_mul_sd(rinv, rinv));
675 _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
677 _mm_mul_sd(sk2_rinv, rinv))));
678 t1 = _mm_mul_sd(rinv,
679 _mm_add_sd(_mm_mul_sd(dlij, t1),
680 _mm_add_sd(t2, t3)));
682 dadx2 = _mm_and_pd(t1, obc_mask1);
684 _mm_store_pd(dadx, dadx1);
686 _mm_store_pd(dadx, dadx2);
689 gmx_mm_update_1pot_pd(sum_ai, work+ii);
693 /* Parallel summations */
696 gmx_sum(natoms, work, cr);
698 else if (DOMAINDECOMP(cr))
700 dd_atom_sum_real(cr->dd, work);
703 if (gb_algorithm == egbHCT)
706 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
708 if (born->use[i] != 0)
710 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
711 sum = 1.0/rr - work[i];
712 min_rad = rr + doffset;
715 born->bRad[i] = rad > min_rad ? rad : min_rad;
716 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
720 /* Extra communication required for DD */
721 if (DOMAINDECOMP(cr))
723 dd_atom_spread_real(cr->dd, born->bRad);
724 dd_atom_spread_real(cr->dd, fr->invsqrta);
730 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
732 if (born->use[i] != 0)
734 rr = top->atomtypes.gb_radius[md->typeA[i]];
742 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
743 born->bRad[i] = rr_inv - tsum*rr_inv2;
744 born->bRad[i] = 1.0 / born->bRad[i];
746 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
748 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
749 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
752 /* Extra (local) communication required for DD */
753 if (DOMAINDECOMP(cr))
755 dd_atom_spread_real(cr->dd, born->bRad);
756 dd_atom_spread_real(cr->dd, fr->invsqrta);
757 dd_atom_spread_real(cr->dd, born->drobc);
768 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
769 double *x, double *f, double *fshift, double *shiftvec,
770 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
772 int i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
777 double rbi, shX, shY, shZ;
782 __m128d fix, fiy, fiz;
786 __m128d rbai, rbaj, f_gb, f_gb_ai;
787 __m128d xmm1, xmm2, xmm3;
789 const __m128d two = _mm_set1_pd(2.0);
795 /* Loop to get the proper form for the Born radius term, sse style */
799 if (gb_algorithm == egbSTILL)
801 for (i = n0; i < n1; i++)
804 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
807 else if (gb_algorithm == egbHCT)
809 for (i = n0; i < n1; i++)
812 rb[i] = rbi * rbi * dvda[i];
815 else if (gb_algorithm == egbOBC)
817 for (i = n0; i < n1; i++)
820 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
824 jz = _mm_setzero_pd();
828 for (i = 0; i < nl->nri; i++)
832 is3 = 3*nl->shift[i];
834 shY = shiftvec[is3+1];
835 shZ = shiftvec[is3+2];
837 nj1 = nl->jindex[i+1];
839 ix = _mm_set1_pd(shX+x[ii3+0]);
840 iy = _mm_set1_pd(shY+x[ii3+1]);
841 iz = _mm_set1_pd(shZ+x[ii3+2]);
843 rbai = _mm_load1_pd(rb+ii);
844 fix = _mm_setzero_pd();
845 fiy = _mm_setzero_pd();
846 fiz = _mm_setzero_pd();
849 for (k = nj0; k < nj1-1; k += 2)
857 GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
859 dx = _mm_sub_pd(ix, jx);
860 dy = _mm_sub_pd(iy, jy);
861 dz = _mm_sub_pd(iz, jz);
863 GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);
865 /* load chain rule terms for j1-4 */
866 f_gb = _mm_load_pd(dadx);
868 f_gb_ai = _mm_load_pd(dadx);
871 /* calculate scalar force */
872 f_gb = _mm_mul_pd(f_gb, rbai);
873 f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
874 f_gb = _mm_add_pd(f_gb, f_gb_ai);
876 tx = _mm_mul_pd(f_gb, dx);
877 ty = _mm_mul_pd(f_gb, dy);
878 tz = _mm_mul_pd(f_gb, dz);
880 fix = _mm_add_pd(fix, tx);
881 fiy = _mm_add_pd(fiy, ty);
882 fiz = _mm_add_pd(fiz, tz);
884 GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
887 /*deal with odd elements */
893 GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
895 dx = _mm_sub_sd(ix, jx);
896 dy = _mm_sub_sd(iy, jy);
897 dz = _mm_sub_sd(iz, jz);
899 GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);
901 /* load chain rule terms */
902 f_gb = _mm_load_pd(dadx);
904 f_gb_ai = _mm_load_pd(dadx);
907 /* calculate scalar force */
908 f_gb = _mm_mul_sd(f_gb, rbai);
909 f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
910 f_gb = _mm_add_sd(f_gb, f_gb_ai);
912 tx = _mm_mul_sd(f_gb, dx);
913 ty = _mm_mul_sd(f_gb, dy);
914 tz = _mm_mul_sd(f_gb, dz);
916 fix = _mm_add_sd(fix, tx);
917 fiy = _mm_add_sd(fiy, ty);
918 fiz = _mm_add_sd(fiz, tz);
920 GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
923 /* fix/fiy/fiz now contain four partial force terms, that all should be
924 * added to the i particle forces and shift forces.
926 gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
933 /* keep compiler happy */
934 int genborn_sse2_dummy;
936 #endif /* SSE2 intrinsics available */