2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2013,2014, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
48 #include "gromacs/fileio/pdbio.h"
54 #include "gmx_fatal.h"
55 #include "mtop_util.h"
58 #include "gromacs/utility/gmxmpi.h"
61 /* Only compile this file if SSE intrinsics are available */
62 #if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
64 #include <gmx_sse2_single.h>
65 #include <emmintrin.h>
67 #include "genborn_sse2_single.h"
71 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
72 int natoms, gmx_localtop_t *top,
73 float *x, t_nblist *nl,
76 int i, k, n, ii, is3, ii3, nj0, nj1, offset;
77 int jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
78 int jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
99 __m128 rsq, rinv, rinv2, rinv4, rinv6;
100 __m128 rsqB, rinvB, rinv2B, rinv4B, rinv6B;
101 __m128 ratio, gpi, rai, raj, vai, vaj, rvdw;
102 __m128 ratioB, rajB, vajB, rvdwB;
103 __m128 ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
104 __m128 ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
105 __m128 mask, icf4, icf6, mask_cmp;
106 __m128 icf4B, icf6B, mask_cmpB;
108 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
109 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
110 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
112 const __m128 half = _mm_set1_ps(0.5f);
113 const __m128 three = _mm_set1_ps(3.0f);
114 const __m128 one = _mm_set1_ps(1.0f);
115 const __m128 two = _mm_set1_ps(2.0f);
116 const __m128 zero = _mm_set1_ps(0.0f);
117 const __m128 four = _mm_set1_ps(4.0f);
119 const __m128 still_p5inv = _mm_set1_ps(STILL_P5INV);
120 const __m128 still_pip5 = _mm_set1_ps(STILL_PIP5);
121 const __m128 still_p4 = _mm_set1_ps(STILL_P4);
123 factor = 0.5 * ONE_4PI_EPS0;
125 gb_radius = born->gb_radius;
127 work = born->gpol_still_work;
129 shiftvec = fr->shift_vec[0];
132 jnrA = jnrB = jnrC = jnrD = 0;
133 jx = _mm_setzero_ps();
134 jy = _mm_setzero_ps();
135 jz = _mm_setzero_ps();
139 for (i = 0; i < natoms; i++)
144 for (i = 0; i < nl->nri; i++)
148 is3 = 3*nl->shift[i];
150 shY = shiftvec[is3+1];
151 shZ = shiftvec[is3+2];
153 nj1 = nl->jindex[i+1];
155 ix = _mm_set1_ps(shX+x[ii3+0]);
156 iy = _mm_set1_ps(shY+x[ii3+1]);
157 iz = _mm_set1_ps(shZ+x[ii3+2]);
159 offset = (nj1-nj0)%4;
161 /* Polarization energy for atom ai */
162 gpi = _mm_setzero_ps();
164 rai = _mm_load1_ps(gb_radius+ii);
165 prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
167 for (k = nj0; k < nj1-4-offset; k += 8)
187 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
188 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
190 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
191 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
192 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
193 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);
195 dx = _mm_sub_ps(ix, jx);
196 dy = _mm_sub_ps(iy, jy);
197 dz = _mm_sub_ps(iz, jz);
198 dxB = _mm_sub_ps(ix, jxB);
199 dyB = _mm_sub_ps(iy, jyB);
200 dzB = _mm_sub_ps(iz, jzB);
202 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
203 rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
204 rinv = gmx_mm_invsqrt_ps(rsq);
205 rinvB = gmx_mm_invsqrt_ps(rsqB);
206 rinv2 = _mm_mul_ps(rinv, rinv);
207 rinv2B = _mm_mul_ps(rinvB, rinvB);
208 rinv4 = _mm_mul_ps(rinv2, rinv2);
209 rinv4B = _mm_mul_ps(rinv2B, rinv2B);
210 rinv6 = _mm_mul_ps(rinv4, rinv2);
211 rinv6B = _mm_mul_ps(rinv4B, rinv2B);
213 rvdw = _mm_add_ps(rai, raj);
214 rvdwB = _mm_add_ps(rai, rajB);
215 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
216 ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));
218 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
219 mask_cmpB = _mm_cmple_ps(ratioB, still_p5inv);
221 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
222 if (0 == _mm_movemask_ps(mask_cmp) )
224 /* if ratio>still_p5inv for ALL elements */
226 dccf = _mm_setzero_ps();
230 ratio = _mm_min_ps(ratio, still_p5inv);
231 theta = _mm_mul_ps(ratio, still_pip5);
232 gmx_mm_sincos_ps(theta, &sinq, &cosq);
233 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
234 ccf = _mm_mul_ps(term, term);
235 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
236 _mm_mul_ps(sinq, theta));
238 if (0 == _mm_movemask_ps(mask_cmpB) )
240 /* if ratio>still_p5inv for ALL elements */
242 dccfB = _mm_setzero_ps();
246 ratioB = _mm_min_ps(ratioB, still_p5inv);
247 thetaB = _mm_mul_ps(ratioB, still_pip5);
248 gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
249 termB = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
250 ccfB = _mm_mul_ps(termB, termB);
251 dccfB = _mm_mul_ps(_mm_mul_ps(two, termB),
252 _mm_mul_ps(sinqB, thetaB));
255 prod = _mm_mul_ps(still_p4, vaj);
256 prodB = _mm_mul_ps(still_p4, vajB);
257 icf4 = _mm_mul_ps(ccf, rinv4);
258 icf4B = _mm_mul_ps(ccfB, rinv4B);
259 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
260 icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);
262 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
263 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));
265 gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );
267 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
269 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
271 _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
273 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
277 for (; k < nj1-offset; k += 4)
289 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
291 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
292 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
294 dx = _mm_sub_ps(ix, jx);
295 dy = _mm_sub_ps(iy, jy);
296 dz = _mm_sub_ps(iz, jz);
298 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
299 rinv = gmx_mm_invsqrt_ps(rsq);
300 rinv2 = _mm_mul_ps(rinv, rinv);
301 rinv4 = _mm_mul_ps(rinv2, rinv2);
302 rinv6 = _mm_mul_ps(rinv4, rinv2);
304 rvdw = _mm_add_ps(rai, raj);
305 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
307 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
309 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
310 if (0 == _mm_movemask_ps(mask_cmp))
312 /* if ratio>still_p5inv for ALL elements */
314 dccf = _mm_setzero_ps();
318 ratio = _mm_min_ps(ratio, still_p5inv);
319 theta = _mm_mul_ps(ratio, still_pip5);
320 gmx_mm_sincos_ps(theta, &sinq, &cosq);
321 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
322 ccf = _mm_mul_ps(term, term);
323 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
324 _mm_mul_ps(sinq, theta));
327 prod = _mm_mul_ps(still_p4, vaj);
328 icf4 = _mm_mul_ps(ccf, rinv4);
329 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
331 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
333 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
335 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
337 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
347 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
348 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
349 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
352 else if (offset == 2)
358 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
359 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
360 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
365 /* offset must be 3 */
372 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
373 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
374 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
378 dx = _mm_sub_ps(ix, jx);
379 dy = _mm_sub_ps(iy, jy);
380 dz = _mm_sub_ps(iz, jz);
382 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
383 rinv = gmx_mm_invsqrt_ps(rsq);
384 rinv2 = _mm_mul_ps(rinv, rinv);
385 rinv4 = _mm_mul_ps(rinv2, rinv2);
386 rinv6 = _mm_mul_ps(rinv4, rinv2);
388 rvdw = _mm_add_ps(rai, raj);
389 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
391 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
393 if (0 == _mm_movemask_ps(mask_cmp))
395 /* if ratio>still_p5inv for ALL elements */
397 dccf = _mm_setzero_ps();
401 ratio = _mm_min_ps(ratio, still_p5inv);
402 theta = _mm_mul_ps(ratio, still_pip5);
403 gmx_mm_sincos_ps(theta, &sinq, &cosq);
404 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
405 ccf = _mm_mul_ps(term, term);
406 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
407 _mm_mul_ps(sinq, theta));
410 prod = _mm_mul_ps(still_p4, vaj);
411 icf4 = _mm_mul_ps(ccf, rinv4);
412 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
414 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
416 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
418 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
421 tmp = _mm_mul_ps(prod_ai, icf4);
425 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
427 else if (offset == 2)
429 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
433 /* offset must be 3 */
434 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
437 GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
440 /* Sum up the polarization energy from other nodes */
443 gmx_sum(natoms, work, cr);
445 else if (DOMAINDECOMP(cr))
447 dd_atom_sum_real(cr->dd, work);
450 /* Compute the radii */
451 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
453 if (born->use[i] != 0)
455 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
456 gpi2 = gpi_ai * gpi_ai;
457 born->bRad[i] = factor*gmx_invsqrt(gpi2);
458 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
462 /* Extra (local) communication required for DD */
463 if (DOMAINDECOMP(cr))
465 dd_atom_spread_real(cr->dd, born->bRad);
466 dd_atom_spread_real(cr->dd, fr->invsqrta);
474 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
475 float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
477 int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
478 int jnrA, jnrB, jnrC, jnrD;
479 int j3A, j3B, j3C, j3D;
480 int jnrE, jnrF, jnrG, jnrH;
481 int j3E, j3F, j3G, j3H;
483 float rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
484 float sum_ai2, sum_ai3, tsum, tchain, doffset;
493 __m128 ix, iy, iz, jx, jy, jz;
494 __m128 dx, dy, dz, t1, t2, t3, t4;
496 __m128 rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
497 __m128 uij, lij2, uij2, lij3, uij3, diff2;
498 __m128 lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
499 __m128 sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
503 __m128 obc_mask1, obc_mask2, obc_mask3;
504 __m128 jxB, jyB, jzB, t1B, t2B, t3B, t4B;
505 __m128 dxB, dyB, dzB, rsqB, rinvB, rB;
506 __m128 rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
507 __m128 uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
508 __m128 lij_invB, sk2_invB, prodB;
509 __m128 sk_ajB, sk2_ajB, sk2_rinvB;
510 __m128 dadx1B, dadx2B;
512 __m128 obc_mask1B, obc_mask2B, obc_mask3B;
514 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
515 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
516 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
518 __m128 oneeighth = _mm_set1_ps(0.125);
519 __m128 onefourth = _mm_set1_ps(0.25);
521 const __m128 half = _mm_set1_ps(0.5f);
522 const __m128 three = _mm_set1_ps(3.0f);
523 const __m128 one = _mm_set1_ps(1.0f);
524 const __m128 two = _mm_set1_ps(2.0f);
525 const __m128 zero = _mm_set1_ps(0.0f);
526 const __m128 neg = _mm_set1_ps(-1.0f);
528 /* Set the dielectric offset */
529 doffset = born->gb_doffset;
530 gb_radius = born->gb_radius;
531 obc_param = born->param;
532 work = born->gpol_hct_work;
535 shiftvec = fr->shift_vec[0];
537 jx = _mm_setzero_ps();
538 jy = _mm_setzero_ps();
539 jz = _mm_setzero_ps();
541 jnrA = jnrB = jnrC = jnrD = 0;
543 for (i = 0; i < born->nr; i++)
548 for (i = 0; i < nl->nri; i++)
552 is3 = 3*nl->shift[i];
554 shY = shiftvec[is3+1];
555 shZ = shiftvec[is3+2];
557 nj1 = nl->jindex[i+1];
559 ix = _mm_set1_ps(shX+x[ii3+0]);
560 iy = _mm_set1_ps(shY+x[ii3+1]);
561 iz = _mm_set1_ps(shZ+x[ii3+2]);
563 offset = (nj1-nj0)%4;
565 rai = _mm_load1_ps(gb_radius+ii);
566 rai_inv = gmx_mm_inv_ps(rai);
568 sum_ai = _mm_setzero_ps();
570 sk_ai = _mm_load1_ps(born->param+ii);
571 sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
573 for (k = nj0; k < nj1-4-offset; k += 8)
593 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
594 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
595 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
596 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
597 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
598 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
600 dx = _mm_sub_ps(ix, jx);
601 dy = _mm_sub_ps(iy, jy);
602 dz = _mm_sub_ps(iz, jz);
603 dxB = _mm_sub_ps(ix, jxB);
604 dyB = _mm_sub_ps(iy, jyB);
605 dzB = _mm_sub_ps(iz, jzB);
607 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
608 rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
610 rinv = gmx_mm_invsqrt_ps(rsq);
611 r = _mm_mul_ps(rsq, rinv);
612 rinvB = gmx_mm_invsqrt_ps(rsqB);
613 rB = _mm_mul_ps(rsqB, rinvB);
615 /* Compute raj_inv aj1-4 */
616 raj_inv = gmx_mm_inv_ps(raj);
617 raj_invB = gmx_mm_inv_ps(rajB);
619 /* Evaluate influence of atom aj -> ai */
620 t1 = _mm_add_ps(r, sk_aj);
621 t2 = _mm_sub_ps(r, sk_aj);
622 t3 = _mm_sub_ps(sk_aj, r);
623 t1B = _mm_add_ps(rB, sk_ajB);
624 t2B = _mm_sub_ps(rB, sk_ajB);
625 t3B = _mm_sub_ps(sk_ajB, rB);
626 obc_mask1 = _mm_cmplt_ps(rai, t1);
627 obc_mask2 = _mm_cmplt_ps(rai, t2);
628 obc_mask3 = _mm_cmplt_ps(rai, t3);
629 obc_mask1B = _mm_cmplt_ps(rai, t1B);
630 obc_mask2B = _mm_cmplt_ps(rai, t2B);
631 obc_mask3B = _mm_cmplt_ps(rai, t3B);
633 uij = gmx_mm_inv_ps(t1);
634 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
635 _mm_andnot_ps(obc_mask2, rai_inv));
636 dlij = _mm_and_ps(one, obc_mask2);
637 uij2 = _mm_mul_ps(uij, uij);
638 uij3 = _mm_mul_ps(uij2, uij);
639 lij2 = _mm_mul_ps(lij, lij);
640 lij3 = _mm_mul_ps(lij2, lij);
642 uijB = gmx_mm_inv_ps(t1B);
643 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
644 _mm_andnot_ps(obc_mask2B, rai_inv));
645 dlijB = _mm_and_ps(one, obc_mask2B);
646 uij2B = _mm_mul_ps(uijB, uijB);
647 uij3B = _mm_mul_ps(uij2B, uijB);
648 lij2B = _mm_mul_ps(lijB, lijB);
649 lij3B = _mm_mul_ps(lij2B, lijB);
651 diff2 = _mm_sub_ps(uij2, lij2);
652 lij_inv = gmx_mm_invsqrt_ps(lij2);
653 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
654 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
655 prod = _mm_mul_ps(onefourth, sk2_rinv);
657 diff2B = _mm_sub_ps(uij2B, lij2B);
658 lij_invB = gmx_mm_invsqrt_ps(lij2B);
659 sk2_ajB = _mm_mul_ps(sk_ajB, sk_ajB);
660 sk2_rinvB = _mm_mul_ps(sk2_ajB, rinvB);
661 prodB = _mm_mul_ps(onefourth, sk2_rinvB);
663 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
664 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
666 t1 = _mm_sub_ps(lij, uij);
667 t2 = _mm_mul_ps(diff2,
668 _mm_sub_ps(_mm_mul_ps(onefourth, r),
670 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
671 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
672 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
673 t4 = _mm_and_ps(t4, obc_mask3);
674 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
676 t1B = _mm_sub_ps(lijB, uijB);
677 t2B = _mm_mul_ps(diff2B,
678 _mm_sub_ps(_mm_mul_ps(onefourth, rB),
680 t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
681 t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
682 t4B = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
683 t4B = _mm_and_ps(t4B, obc_mask3B);
684 t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
686 sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
688 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
689 _mm_mul_ps(prod, lij3));
691 _mm_mul_ps(onefourth,
692 _mm_add_ps(_mm_mul_ps(lij, rinv),
693 _mm_mul_ps(lij3, r))));
694 t2 = _mm_mul_ps(onefourth,
695 _mm_add_ps(_mm_mul_ps(uij, rinv),
696 _mm_mul_ps(uij3, r)));
698 _mm_add_ps(_mm_mul_ps(half, uij2),
699 _mm_mul_ps(prod, uij3)));
700 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
701 _mm_mul_ps(rinv, rinv));
703 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
705 _mm_mul_ps(sk2_rinv, rinv))));
706 t1 = _mm_mul_ps(rinv,
707 _mm_add_ps(_mm_mul_ps(dlij, t1),
708 _mm_add_ps(t2, t3)));
712 t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
713 _mm_mul_ps(prodB, lij3B));
714 t1B = _mm_sub_ps(t1B,
715 _mm_mul_ps(onefourth,
716 _mm_add_ps(_mm_mul_ps(lijB, rinvB),
717 _mm_mul_ps(lij3B, rB))));
718 t2B = _mm_mul_ps(onefourth,
719 _mm_add_ps(_mm_mul_ps(uijB, rinvB),
720 _mm_mul_ps(uij3B, rB)));
721 t2B = _mm_sub_ps(t2B,
722 _mm_add_ps(_mm_mul_ps(half, uij2B),
723 _mm_mul_ps(prodB, uij3B)));
724 t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
725 _mm_mul_ps(rinvB, rinvB));
726 t3B = _mm_sub_ps(t3B,
727 _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
729 _mm_mul_ps(sk2_rinvB, rinvB))));
730 t1B = _mm_mul_ps(rinvB,
731 _mm_add_ps(_mm_mul_ps(dlijB, t1B),
732 _mm_add_ps(t2B, t3B)));
734 dadx1 = _mm_and_ps(t1, obc_mask1);
735 dadx1B = _mm_and_ps(t1B, obc_mask1B);
738 /* Evaluate influence of atom ai -> aj */
739 t1 = _mm_add_ps(r, sk_ai);
740 t2 = _mm_sub_ps(r, sk_ai);
741 t3 = _mm_sub_ps(sk_ai, r);
742 t1B = _mm_add_ps(rB, sk_ai);
743 t2B = _mm_sub_ps(rB, sk_ai);
744 t3B = _mm_sub_ps(sk_ai, rB);
745 obc_mask1 = _mm_cmplt_ps(raj, t1);
746 obc_mask2 = _mm_cmplt_ps(raj, t2);
747 obc_mask3 = _mm_cmplt_ps(raj, t3);
748 obc_mask1B = _mm_cmplt_ps(rajB, t1B);
749 obc_mask2B = _mm_cmplt_ps(rajB, t2B);
750 obc_mask3B = _mm_cmplt_ps(rajB, t3B);
752 uij = gmx_mm_inv_ps(t1);
753 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
754 _mm_andnot_ps(obc_mask2, raj_inv));
755 dlij = _mm_and_ps(one, obc_mask2);
756 uij2 = _mm_mul_ps(uij, uij);
757 uij3 = _mm_mul_ps(uij2, uij);
758 lij2 = _mm_mul_ps(lij, lij);
759 lij3 = _mm_mul_ps(lij2, lij);
761 uijB = gmx_mm_inv_ps(t1B);
762 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
763 _mm_andnot_ps(obc_mask2B, raj_invB));
764 dlijB = _mm_and_ps(one, obc_mask2B);
765 uij2B = _mm_mul_ps(uijB, uijB);
766 uij3B = _mm_mul_ps(uij2B, uijB);
767 lij2B = _mm_mul_ps(lijB, lijB);
768 lij3B = _mm_mul_ps(lij2B, lijB);
770 diff2 = _mm_sub_ps(uij2, lij2);
771 lij_inv = gmx_mm_invsqrt_ps(lij2);
772 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
773 prod = _mm_mul_ps(onefourth, sk2_rinv);
775 diff2B = _mm_sub_ps(uij2B, lij2B);
776 lij_invB = gmx_mm_invsqrt_ps(lij2B);
777 sk2_rinvB = _mm_mul_ps(sk2_ai, rinvB);
778 prodB = _mm_mul_ps(onefourth, sk2_rinvB);
780 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
781 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
783 t1 = _mm_sub_ps(lij, uij);
784 t2 = _mm_mul_ps(diff2,
785 _mm_sub_ps(_mm_mul_ps(onefourth, r),
787 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
788 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
789 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
790 t4 = _mm_and_ps(t4, obc_mask3);
791 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
793 t1B = _mm_sub_ps(lijB, uijB);
794 t2B = _mm_mul_ps(diff2B,
795 _mm_sub_ps(_mm_mul_ps(onefourth, rB),
797 t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
798 t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
799 t4B = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
800 t4B = _mm_and_ps(t4B, obc_mask3B);
801 t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
803 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
804 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
806 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
807 _mm_mul_ps(prod, lij3));
809 _mm_mul_ps(onefourth,
810 _mm_add_ps(_mm_mul_ps(lij, rinv),
811 _mm_mul_ps(lij3, r))));
812 t2 = _mm_mul_ps(onefourth,
813 _mm_add_ps(_mm_mul_ps(uij, rinv),
814 _mm_mul_ps(uij3, r)));
816 _mm_add_ps(_mm_mul_ps(half, uij2),
817 _mm_mul_ps(prod, uij3)));
818 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
819 _mm_mul_ps(rinv, rinv));
821 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
823 _mm_mul_ps(sk2_rinv, rinv))));
824 t1 = _mm_mul_ps(rinv,
825 _mm_add_ps(_mm_mul_ps(dlij, t1),
826 _mm_add_ps(t2, t3)));
829 t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
830 _mm_mul_ps(prodB, lij3B));
831 t1B = _mm_sub_ps(t1B,
832 _mm_mul_ps(onefourth,
833 _mm_add_ps(_mm_mul_ps(lijB, rinvB),
834 _mm_mul_ps(lij3B, rB))));
835 t2B = _mm_mul_ps(onefourth,
836 _mm_add_ps(_mm_mul_ps(uijB, rinvB),
837 _mm_mul_ps(uij3B, rB)));
838 t2B = _mm_sub_ps(t2B,
839 _mm_add_ps(_mm_mul_ps(half, uij2B),
840 _mm_mul_ps(prodB, uij3B)));
841 t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
842 _mm_mul_ps(rinvB, rinvB));
843 t3B = _mm_sub_ps(t3B,
844 _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
846 _mm_mul_ps(sk2_rinvB, rinvB))));
847 t1B = _mm_mul_ps(rinvB,
848 _mm_add_ps(_mm_mul_ps(dlijB, t1B),
849 _mm_add_ps(t2B, t3B)));
852 dadx2 = _mm_and_ps(t1, obc_mask1);
853 dadx2B = _mm_and_ps(t1B, obc_mask1B);
855 _mm_store_ps(dadx, dadx1);
857 _mm_store_ps(dadx, dadx2);
859 _mm_store_ps(dadx, dadx1B);
861 _mm_store_ps(dadx, dadx2B);
864 } /* end normal inner loop */
866 for (; k < nj1-offset; k += 4)
878 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
879 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
880 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
882 dx = _mm_sub_ps(ix, jx);
883 dy = _mm_sub_ps(iy, jy);
884 dz = _mm_sub_ps(iz, jz);
886 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
888 rinv = gmx_mm_invsqrt_ps(rsq);
889 r = _mm_mul_ps(rsq, rinv);
891 /* Compute raj_inv aj1-4 */
892 raj_inv = gmx_mm_inv_ps(raj);
894 /* Evaluate influence of atom aj -> ai */
895 t1 = _mm_add_ps(r, sk_aj);
896 obc_mask1 = _mm_cmplt_ps(rai, t1);
898 if (_mm_movemask_ps(obc_mask1))
900 /* If any of the elements has rai<dr+sk, this is executed */
901 t2 = _mm_sub_ps(r, sk_aj);
902 t3 = _mm_sub_ps(sk_aj, r);
904 obc_mask2 = _mm_cmplt_ps(rai, t2);
905 obc_mask3 = _mm_cmplt_ps(rai, t3);
907 uij = gmx_mm_inv_ps(t1);
908 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
909 _mm_andnot_ps(obc_mask2, rai_inv));
910 dlij = _mm_and_ps(one, obc_mask2);
911 uij2 = _mm_mul_ps(uij, uij);
912 uij3 = _mm_mul_ps(uij2, uij);
913 lij2 = _mm_mul_ps(lij, lij);
914 lij3 = _mm_mul_ps(lij2, lij);
915 diff2 = _mm_sub_ps(uij2, lij2);
916 lij_inv = gmx_mm_invsqrt_ps(lij2);
917 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
918 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
919 prod = _mm_mul_ps(onefourth, sk2_rinv);
920 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
921 t1 = _mm_sub_ps(lij, uij);
922 t2 = _mm_mul_ps(diff2,
923 _mm_sub_ps(_mm_mul_ps(onefourth, r),
925 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
926 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
927 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
928 t4 = _mm_and_ps(t4, obc_mask3);
929 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
930 sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
931 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
932 _mm_mul_ps(prod, lij3));
934 _mm_mul_ps(onefourth,
935 _mm_add_ps(_mm_mul_ps(lij, rinv),
936 _mm_mul_ps(lij3, r))));
937 t2 = _mm_mul_ps(onefourth,
938 _mm_add_ps(_mm_mul_ps(uij, rinv),
939 _mm_mul_ps(uij3, r)));
941 _mm_add_ps(_mm_mul_ps(half, uij2),
942 _mm_mul_ps(prod, uij3)));
943 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
944 _mm_mul_ps(rinv, rinv));
946 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
948 _mm_mul_ps(sk2_rinv, rinv))));
949 t1 = _mm_mul_ps(rinv,
950 _mm_add_ps(_mm_mul_ps(dlij, t1),
951 _mm_add_ps(t2, t3)));
953 dadx1 = _mm_and_ps(t1, obc_mask1);
957 dadx1 = _mm_setzero_ps();
960 /* Evaluate influence of atom ai -> aj */
961 t1 = _mm_add_ps(r, sk_ai);
962 obc_mask1 = _mm_cmplt_ps(raj, t1);
964 if (_mm_movemask_ps(obc_mask1))
966 t2 = _mm_sub_ps(r, sk_ai);
967 t3 = _mm_sub_ps(sk_ai, r);
968 obc_mask2 = _mm_cmplt_ps(raj, t2);
969 obc_mask3 = _mm_cmplt_ps(raj, t3);
971 uij = gmx_mm_inv_ps(t1);
972 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
973 _mm_andnot_ps(obc_mask2, raj_inv));
974 dlij = _mm_and_ps(one, obc_mask2);
975 uij2 = _mm_mul_ps(uij, uij);
976 uij3 = _mm_mul_ps(uij2, uij);
977 lij2 = _mm_mul_ps(lij, lij);
978 lij3 = _mm_mul_ps(lij2, lij);
979 diff2 = _mm_sub_ps(uij2, lij2);
980 lij_inv = gmx_mm_invsqrt_ps(lij2);
981 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
982 prod = _mm_mul_ps(onefourth, sk2_rinv);
983 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
984 t1 = _mm_sub_ps(lij, uij);
985 t2 = _mm_mul_ps(diff2,
986 _mm_sub_ps(_mm_mul_ps(onefourth, r),
988 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
989 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
990 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
991 t4 = _mm_and_ps(t4, obc_mask3);
992 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
994 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
996 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
997 _mm_mul_ps(prod, lij3));
999 _mm_mul_ps(onefourth,
1000 _mm_add_ps(_mm_mul_ps(lij, rinv),
1001 _mm_mul_ps(lij3, r))));
1002 t2 = _mm_mul_ps(onefourth,
1003 _mm_add_ps(_mm_mul_ps(uij, rinv),
1004 _mm_mul_ps(uij3, r)));
1006 _mm_add_ps(_mm_mul_ps(half, uij2),
1007 _mm_mul_ps(prod, uij3)));
1008 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1009 _mm_mul_ps(rinv, rinv));
1011 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1013 _mm_mul_ps(sk2_rinv, rinv))));
1014 t1 = _mm_mul_ps(rinv,
1015 _mm_add_ps(_mm_mul_ps(dlij, t1),
1016 _mm_add_ps(t2, t3)));
1017 dadx2 = _mm_and_ps(t1, obc_mask1);
1021 dadx2 = _mm_setzero_ps();
1024 _mm_store_ps(dadx, dadx1);
1026 _mm_store_ps(dadx, dadx2);
1028 } /* end normal inner loop */
1036 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1037 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1038 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1041 else if (offset == 2)
1047 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1048 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1049 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1054 /* offset must be 3 */
1061 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1062 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1063 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1067 dx = _mm_sub_ps(ix, jx);
1068 dy = _mm_sub_ps(iy, jy);
1069 dz = _mm_sub_ps(iz, jz);
1071 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
1073 rinv = gmx_mm_invsqrt_ps(rsq);
1074 r = _mm_mul_ps(rsq, rinv);
1076 /* Compute raj_inv aj1-4 */
1077 raj_inv = gmx_mm_inv_ps(raj);
1079 /* Evaluate influence of atom aj -> ai */
1080 t1 = _mm_add_ps(r, sk_aj);
1081 obc_mask1 = _mm_cmplt_ps(rai, t1);
1082 obc_mask1 = _mm_and_ps(obc_mask1, mask);
1084 if (_mm_movemask_ps(obc_mask1))
1086 t2 = _mm_sub_ps(r, sk_aj);
1087 t3 = _mm_sub_ps(sk_aj, r);
1088 obc_mask2 = _mm_cmplt_ps(rai, t2);
1089 obc_mask3 = _mm_cmplt_ps(rai, t3);
1091 uij = gmx_mm_inv_ps(t1);
1092 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1093 _mm_andnot_ps(obc_mask2, rai_inv));
1094 dlij = _mm_and_ps(one, obc_mask2);
1095 uij2 = _mm_mul_ps(uij, uij);
1096 uij3 = _mm_mul_ps(uij2, uij);
1097 lij2 = _mm_mul_ps(lij, lij);
1098 lij3 = _mm_mul_ps(lij2, lij);
1099 diff2 = _mm_sub_ps(uij2, lij2);
1100 lij_inv = gmx_mm_invsqrt_ps(lij2);
1101 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
1102 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
1103 prod = _mm_mul_ps(onefourth, sk2_rinv);
1104 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1105 t1 = _mm_sub_ps(lij, uij);
1106 t2 = _mm_mul_ps(diff2,
1107 _mm_sub_ps(_mm_mul_ps(onefourth, r),
1109 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1110 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1111 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1112 t4 = _mm_and_ps(t4, obc_mask3);
1113 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1114 sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1115 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
1116 _mm_mul_ps(prod, lij3));
1118 _mm_mul_ps(onefourth,
1119 _mm_add_ps(_mm_mul_ps(lij, rinv),
1120 _mm_mul_ps(lij3, r))));
1121 t2 = _mm_mul_ps(onefourth,
1122 _mm_add_ps(_mm_mul_ps(uij, rinv),
1123 _mm_mul_ps(uij3, r)));
1125 _mm_add_ps(_mm_mul_ps(half, uij2),
1126 _mm_mul_ps(prod, uij3)));
1127 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1128 _mm_mul_ps(rinv, rinv));
1130 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1132 _mm_mul_ps(sk2_rinv, rinv))));
1133 t1 = _mm_mul_ps(rinv,
1134 _mm_add_ps(_mm_mul_ps(dlij, t1),
1135 _mm_add_ps(t2, t3)));
1136 dadx1 = _mm_and_ps(t1, obc_mask1);
1140 dadx1 = _mm_setzero_ps();
1143 /* Evaluate influence of atom ai -> aj */
1144 t1 = _mm_add_ps(r, sk_ai);
1145 obc_mask1 = _mm_cmplt_ps(raj, t1);
1146 obc_mask1 = _mm_and_ps(obc_mask1, mask);
1148 if (_mm_movemask_ps(obc_mask1))
1150 t2 = _mm_sub_ps(r, sk_ai);
1151 t3 = _mm_sub_ps(sk_ai, r);
1152 obc_mask2 = _mm_cmplt_ps(raj, t2);
1153 obc_mask3 = _mm_cmplt_ps(raj, t3);
1155 uij = gmx_mm_inv_ps(t1);
1156 lij = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1157 _mm_andnot_ps(obc_mask2, raj_inv));
1158 dlij = _mm_and_ps(one, obc_mask2);
1159 uij2 = _mm_mul_ps(uij, uij);
1160 uij3 = _mm_mul_ps(uij2, uij);
1161 lij2 = _mm_mul_ps(lij, lij);
1162 lij3 = _mm_mul_ps(lij2, lij);
1163 diff2 = _mm_sub_ps(uij2, lij2);
1164 lij_inv = gmx_mm_invsqrt_ps(lij2);
1165 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
1166 prod = _mm_mul_ps(onefourth, sk2_rinv);
1167 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1168 t1 = _mm_sub_ps(lij, uij);
1169 t2 = _mm_mul_ps(diff2,
1170 _mm_sub_ps(_mm_mul_ps(onefourth, r),
1172 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1173 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1174 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
1175 t4 = _mm_and_ps(t4, obc_mask3);
1176 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1178 tmp = _mm_and_ps(t1, obc_mask1);
1180 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
1181 _mm_mul_ps(prod, lij3));
1183 _mm_mul_ps(onefourth,
1184 _mm_add_ps(_mm_mul_ps(lij, rinv),
1185 _mm_mul_ps(lij3, r))));
1186 t2 = _mm_mul_ps(onefourth,
1187 _mm_add_ps(_mm_mul_ps(uij, rinv),
1188 _mm_mul_ps(uij3, r)));
1190 _mm_add_ps(_mm_mul_ps(half, uij2),
1191 _mm_mul_ps(prod, uij3)));
1192 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1193 _mm_mul_ps(rinv, rinv));
1195 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1197 _mm_mul_ps(sk2_rinv, rinv))));
1198 t1 = _mm_mul_ps(rinv,
1199 _mm_add_ps(_mm_mul_ps(dlij, t1),
1200 _mm_add_ps(t2, t3)));
1201 dadx2 = _mm_and_ps(t1, obc_mask1);
1205 dadx2 = _mm_setzero_ps();
1206 tmp = _mm_setzero_ps();
1209 _mm_store_ps(dadx, dadx1);
1211 _mm_store_ps(dadx, dadx2);
1216 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
1218 else if (offset == 2)
1220 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
1224 /* offset must be 3 */
1225 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
1229 GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
1233 /* Parallel summations */
1236 gmx_sum(natoms, work, cr);
1238 else if (DOMAINDECOMP(cr))
1240 dd_atom_sum_real(cr->dd, work);
1243 if (gb_algorithm == egbHCT)
1246 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1248 if (born->use[i] != 0)
1250 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1251 sum = 1.0/rr - work[i];
1252 min_rad = rr + doffset;
1255 born->bRad[i] = rad > min_rad ? rad : min_rad;
1256 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1260 /* Extra communication required for DD */
1261 if (DOMAINDECOMP(cr))
1263 dd_atom_spread_real(cr->dd, born->bRad);
1264 dd_atom_spread_real(cr->dd, fr->invsqrta);
1270 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1272 if (born->use[i] != 0)
1274 rr = top->atomtypes.gb_radius[md->typeA[i]];
1282 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1283 born->bRad[i] = rr_inv - tsum*rr_inv2;
1284 born->bRad[i] = 1.0 / born->bRad[i];
1286 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1288 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1289 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1292 /* Extra (local) communication required for DD */
1293 if (DOMAINDECOMP(cr))
1295 dd_atom_spread_real(cr->dd, born->bRad);
1296 dd_atom_spread_real(cr->dd, fr->invsqrta);
1297 dd_atom_spread_real(cr->dd, born->drobc);
1308 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1309 float *x, float *f, float *fshift, float *shiftvec,
1310 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1312 int i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
1313 int jnrA, jnrB, jnrC, jnrD;
1314 int j3A, j3B, j3C, j3D;
1315 int jnrE, jnrF, jnrG, jnrH;
1316 int j3E, j3F, j3G, j3H;
1319 float rbi, shX, shY, shZ;
1324 __m128 jxB, jyB, jzB;
1325 __m128 fix, fiy, fiz;
1328 __m128 dxB, dyB, dzB;
1329 __m128 txB, tyB, tzB;
1331 __m128 rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
1332 __m128 xmm1, xmm2, xmm3;
1334 const __m128 two = _mm_set1_ps(2.0f);
1340 /* Loop to get the proper form for the Born radius term, sse style */
1346 if (gb_algorithm == egbSTILL)
1348 for (i = n0; i < n1; i++)
1350 rbi = born->bRad[i];
1351 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1354 else if (gb_algorithm == egbHCT)
1356 for (i = n0; i < n1; i++)
1358 rbi = born->bRad[i];
1359 rb[i] = rbi * rbi * dvda[i];
1362 else if (gb_algorithm == egbOBC)
1364 for (i = n0; i < n1; i++)
1366 rbi = born->bRad[i];
1367 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1371 jz = _mm_setzero_ps();
1373 n = j3A = j3B = j3C = j3D = 0;
1375 for (i = 0; i < nl->nri; i++)
1379 is3 = 3*nl->shift[i];
1380 shX = shiftvec[is3];
1381 shY = shiftvec[is3+1];
1382 shZ = shiftvec[is3+2];
1383 nj0 = nl->jindex[i];
1384 nj1 = nl->jindex[i+1];
1386 ix = _mm_set1_ps(shX+x[ii3+0]);
1387 iy = _mm_set1_ps(shY+x[ii3+1]);
1388 iz = _mm_set1_ps(shZ+x[ii3+2]);
1390 offset = (nj1-nj0)%4;
1392 rbai = _mm_load1_ps(rb+ii);
1393 fix = _mm_setzero_ps();
1394 fiy = _mm_setzero_ps();
1395 fiz = _mm_setzero_ps();
1398 for (k = nj0; k < nj1-offset; k += 4)
1410 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
1412 dx = _mm_sub_ps(ix, jx);
1413 dy = _mm_sub_ps(iy, jy);
1414 dz = _mm_sub_ps(iz, jz);
1416 GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
1418 /* load chain rule terms for j1-4 */
1419 f_gb = _mm_load_ps(dadx);
1421 f_gb_ai = _mm_load_ps(dadx);
1424 /* calculate scalar force */
1425 f_gb = _mm_mul_ps(f_gb, rbai);
1426 f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1427 f_gb = _mm_add_ps(f_gb, f_gb_ai);
1429 tx = _mm_mul_ps(f_gb, dx);
1430 ty = _mm_mul_ps(f_gb, dy);
1431 tz = _mm_mul_ps(f_gb, dz);
1433 fix = _mm_add_ps(fix, tx);
1434 fiy = _mm_add_ps(fiy, ty);
1435 fiz = _mm_add_ps(fiz, tz);
1437 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
1440 /*deal with odd elements */
1447 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1448 GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
1450 else if (offset == 2)
1456 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1457 GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
1461 /* offset must be 3 */
1468 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1469 GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
1472 dx = _mm_sub_ps(ix, jx);
1473 dy = _mm_sub_ps(iy, jy);
1474 dz = _mm_sub_ps(iz, jz);
1476 /* load chain rule terms for j1-4 */
1477 f_gb = _mm_load_ps(dadx);
1479 f_gb_ai = _mm_load_ps(dadx);
1482 /* calculate scalar force */
1483 f_gb = _mm_mul_ps(f_gb, rbai);
1484 f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1485 f_gb = _mm_add_ps(f_gb, f_gb_ai);
1487 tx = _mm_mul_ps(f_gb, dx);
1488 ty = _mm_mul_ps(f_gb, dy);
1489 tz = _mm_mul_ps(f_gb, dz);
1491 fix = _mm_add_ps(fix, tx);
1492 fiy = _mm_add_ps(fiy, ty);
1493 fiz = _mm_add_ps(fiz, tz);
1497 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
1499 else if (offset == 2)
1501 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
1505 /* offset must be 3 */
1506 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
1510 /* fix/fiy/fiz now contain four partial force terms, that all should be
1511 * added to the i particle forces and shift forces.
1513 gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
1521 /* keep compiler happy */
1522 int genborn_sse_dummy;
1524 #endif /* SSE intrinsics available */