2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2013, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
49 #include "gromacs/fileio/pdbio.h"
55 #include "gmx_fatal.h"
56 #include "mtop_util.h"
59 #include "gromacs/utility/gmxmpi.h"
62 /* Only compile this file if SSE intrinsics are available */
63 #if 0 && defined (GMX_X86_SSE2)
65 #include <gmx_sse2_single.h>
66 #include <emmintrin.h>
68 #include "genborn_sse2_single.h"
72 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
73 int natoms, gmx_localtop_t *top,
74 float *x, t_nblist *nl,
77 int i, k, n, ii, is3, ii3, nj0, nj1, offset;
78 int jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
79 int jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
100 __m128 rsq, rinv, rinv2, rinv4, rinv6;
101 __m128 rsqB, rinvB, rinv2B, rinv4B, rinv6B;
102 __m128 ratio, gpi, rai, raj, vai, vaj, rvdw;
103 __m128 ratioB, rajB, vajB, rvdwB;
104 __m128 ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
105 __m128 ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
106 __m128 mask, icf4, icf6, mask_cmp;
107 __m128 icf4B, icf6B, mask_cmpB;
109 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
110 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
111 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
113 const __m128 half = _mm_set1_ps(0.5f);
114 const __m128 three = _mm_set1_ps(3.0f);
115 const __m128 one = _mm_set1_ps(1.0f);
116 const __m128 two = _mm_set1_ps(2.0f);
117 const __m128 zero = _mm_set1_ps(0.0f);
118 const __m128 four = _mm_set1_ps(4.0f);
120 const __m128 still_p5inv = _mm_set1_ps(STILL_P5INV);
121 const __m128 still_pip5 = _mm_set1_ps(STILL_PIP5);
122 const __m128 still_p4 = _mm_set1_ps(STILL_P4);
124 factor = 0.5 * ONE_4PI_EPS0;
126 gb_radius = born->gb_radius;
128 work = born->gpol_still_work;
130 shiftvec = fr->shift_vec[0];
133 jnrA = jnrB = jnrC = jnrD = 0;
134 jx = _mm_setzero_ps();
135 jy = _mm_setzero_ps();
136 jz = _mm_setzero_ps();
140 for (i = 0; i < natoms; i++)
145 for (i = 0; i < nl->nri; i++)
149 is3 = 3*nl->shift[i];
151 shY = shiftvec[is3+1];
152 shZ = shiftvec[is3+2];
154 nj1 = nl->jindex[i+1];
156 ix = _mm_set1_ps(shX+x[ii3+0]);
157 iy = _mm_set1_ps(shY+x[ii3+1]);
158 iz = _mm_set1_ps(shZ+x[ii3+2]);
160 offset = (nj1-nj0)%4;
162 /* Polarization energy for atom ai */
163 gpi = _mm_setzero_ps();
165 rai = _mm_load1_ps(gb_radius+ii);
166 prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
168 for (k = nj0; k < nj1-4-offset; k += 8)
188 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
189 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
191 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
192 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
193 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
194 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);
196 dx = _mm_sub_ps(ix, jx);
197 dy = _mm_sub_ps(iy, jy);
198 dz = _mm_sub_ps(iz, jz);
199 dxB = _mm_sub_ps(ix, jxB);
200 dyB = _mm_sub_ps(iy, jyB);
201 dzB = _mm_sub_ps(iz, jzB);
203 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
204 rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
205 rinv = gmx_mm_invsqrt_ps(rsq);
206 rinvB = gmx_mm_invsqrt_ps(rsqB);
207 rinv2 = _mm_mul_ps(rinv, rinv);
208 rinv2B = _mm_mul_ps(rinvB, rinvB);
209 rinv4 = _mm_mul_ps(rinv2, rinv2);
210 rinv4B = _mm_mul_ps(rinv2B, rinv2B);
211 rinv6 = _mm_mul_ps(rinv4, rinv2);
212 rinv6B = _mm_mul_ps(rinv4B, rinv2B);
214 rvdw = _mm_add_ps(rai, raj);
215 rvdwB = _mm_add_ps(rai, rajB);
216 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
217 ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));
219 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
220 mask_cmpB = _mm_cmple_ps(ratioB, still_p5inv);
222 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
223 if (0 == _mm_movemask_ps(mask_cmp) )
225 /* if ratio>still_p5inv for ALL elements */
227 dccf = _mm_setzero_ps();
231 ratio = _mm_min_ps(ratio, still_p5inv);
232 theta = _mm_mul_ps(ratio, still_pip5);
233 gmx_mm_sincos_ps(theta, &sinq, &cosq);
234 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
235 ccf = _mm_mul_ps(term, term);
236 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
237 _mm_mul_ps(sinq, theta));
239 if (0 == _mm_movemask_ps(mask_cmpB) )
241 /* if ratio>still_p5inv for ALL elements */
243 dccfB = _mm_setzero_ps();
247 ratioB = _mm_min_ps(ratioB, still_p5inv);
248 thetaB = _mm_mul_ps(ratioB, still_pip5);
249 gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
250 termB = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
251 ccfB = _mm_mul_ps(termB, termB);
252 dccfB = _mm_mul_ps(_mm_mul_ps(two, termB),
253 _mm_mul_ps(sinqB, thetaB));
256 prod = _mm_mul_ps(still_p4, vaj);
257 prodB = _mm_mul_ps(still_p4, vajB);
258 icf4 = _mm_mul_ps(ccf, rinv4);
259 icf4B = _mm_mul_ps(ccfB, rinv4B);
260 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
261 icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);
263 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
264 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));
266 gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );
268 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
270 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
272 _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
274 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
278 for (; k < nj1-offset; k += 4)
290 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
292 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
293 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
295 dx = _mm_sub_ps(ix, jx);
296 dy = _mm_sub_ps(iy, jy);
297 dz = _mm_sub_ps(iz, jz);
299 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
300 rinv = gmx_mm_invsqrt_ps(rsq);
301 rinv2 = _mm_mul_ps(rinv, rinv);
302 rinv4 = _mm_mul_ps(rinv2, rinv2);
303 rinv6 = _mm_mul_ps(rinv4, rinv2);
305 rvdw = _mm_add_ps(rai, raj);
306 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
308 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
310 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
311 if (0 == _mm_movemask_ps(mask_cmp))
313 /* if ratio>still_p5inv for ALL elements */
315 dccf = _mm_setzero_ps();
319 ratio = _mm_min_ps(ratio, still_p5inv);
320 theta = _mm_mul_ps(ratio, still_pip5);
321 gmx_mm_sincos_ps(theta, &sinq, &cosq);
322 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
323 ccf = _mm_mul_ps(term, term);
324 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
325 _mm_mul_ps(sinq, theta));
328 prod = _mm_mul_ps(still_p4, vaj);
329 icf4 = _mm_mul_ps(ccf, rinv4);
330 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
332 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
334 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
336 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
338 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
348 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
349 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
350 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
353 else if (offset == 2)
359 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
360 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
361 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
366 /* offset must be 3 */
373 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
374 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
375 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
379 dx = _mm_sub_ps(ix, jx);
380 dy = _mm_sub_ps(iy, jy);
381 dz = _mm_sub_ps(iz, jz);
383 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
384 rinv = gmx_mm_invsqrt_ps(rsq);
385 rinv2 = _mm_mul_ps(rinv, rinv);
386 rinv4 = _mm_mul_ps(rinv2, rinv2);
387 rinv6 = _mm_mul_ps(rinv4, rinv2);
389 rvdw = _mm_add_ps(rai, raj);
390 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
392 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
394 if (0 == _mm_movemask_ps(mask_cmp))
396 /* if ratio>still_p5inv for ALL elements */
398 dccf = _mm_setzero_ps();
402 ratio = _mm_min_ps(ratio, still_p5inv);
403 theta = _mm_mul_ps(ratio, still_pip5);
404 gmx_mm_sincos_ps(theta, &sinq, &cosq);
405 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
406 ccf = _mm_mul_ps(term, term);
407 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
408 _mm_mul_ps(sinq, theta));
411 prod = _mm_mul_ps(still_p4, vaj);
412 icf4 = _mm_mul_ps(ccf, rinv4);
413 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
415 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
417 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
419 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
422 tmp = _mm_mul_ps(prod_ai, icf4);
426 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
428 else if (offset == 2)
430 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
434 /* offset must be 3 */
435 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
438 GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
441 /* Sum up the polarization energy from other nodes */
444 gmx_sum(natoms, work, cr);
446 else if (DOMAINDECOMP(cr))
448 dd_atom_sum_real(cr->dd, work);
451 /* Compute the radii */
452 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
454 if (born->use[i] != 0)
456 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
457 gpi2 = gpi_ai * gpi_ai;
458 born->bRad[i] = factor*gmx_invsqrt(gpi2);
459 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
463 /* Extra (local) communication required for DD */
464 if (DOMAINDECOMP(cr))
466 dd_atom_spread_real(cr->dd, born->bRad);
467 dd_atom_spread_real(cr->dd, fr->invsqrta);
475 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
476 float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
478 int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
479 int jnrA, jnrB, jnrC, jnrD;
480 int j3A, j3B, j3C, j3D;
481 int jnrE, jnrF, jnrG, jnrH;
482 int j3E, j3F, j3G, j3H;
484 float rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
485 float sum_ai2, sum_ai3, tsum, tchain, doffset;
494 __m128 ix, iy, iz, jx, jy, jz;
495 __m128 dx, dy, dz, t1, t2, t3, t4;
497 __m128 rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
498 __m128 uij, lij2, uij2, lij3, uij3, diff2;
499 __m128 lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
500 __m128 sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
504 __m128 obc_mask1, obc_mask2, obc_mask3;
505 __m128 jxB, jyB, jzB, t1B, t2B, t3B, t4B;
506 __m128 dxB, dyB, dzB, rsqB, rinvB, rB;
507 __m128 rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
508 __m128 uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
509 __m128 lij_invB, sk2_invB, prodB;
510 __m128 sk_ajB, sk2_ajB, sk2_rinvB;
511 __m128 dadx1B, dadx2B;
513 __m128 obc_mask1B, obc_mask2B, obc_mask3B;
515 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
516 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
517 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
519 __m128 oneeighth = _mm_set1_ps(0.125);
520 __m128 onefourth = _mm_set1_ps(0.25);
522 const __m128 half = _mm_set1_ps(0.5f);
523 const __m128 three = _mm_set1_ps(3.0f);
524 const __m128 one = _mm_set1_ps(1.0f);
525 const __m128 two = _mm_set1_ps(2.0f);
526 const __m128 zero = _mm_set1_ps(0.0f);
527 const __m128 neg = _mm_set1_ps(-1.0f);
529 /* Set the dielectric offset */
530 doffset = born->gb_doffset;
531 gb_radius = born->gb_radius;
532 obc_param = born->param;
533 work = born->gpol_hct_work;
536 shiftvec = fr->shift_vec[0];
538 jx = _mm_setzero_ps();
539 jy = _mm_setzero_ps();
540 jz = _mm_setzero_ps();
542 jnrA = jnrB = jnrC = jnrD = 0;
544 for (i = 0; i < born->nr; i++)
549 for (i = 0; i < nl->nri; i++)
553 is3 = 3*nl->shift[i];
555 shY = shiftvec[is3+1];
556 shZ = shiftvec[is3+2];
558 nj1 = nl->jindex[i+1];
560 ix = _mm_set1_ps(shX+x[ii3+0]);
561 iy = _mm_set1_ps(shY+x[ii3+1]);
562 iz = _mm_set1_ps(shZ+x[ii3+2]);
564 offset = (nj1-nj0)%4;
566 rai = _mm_load1_ps(gb_radius+ii);
567 rai_inv = gmx_mm_inv_ps(rai);
569 sum_ai = _mm_setzero_ps();
571 sk_ai = _mm_load1_ps(born->param+ii);
572 sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
574 for (k = nj0; k < nj1-4-offset; k += 8)
594 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
595 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
596 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
597 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
598 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
599 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
601 dx = _mm_sub_ps(ix, jx);
602 dy = _mm_sub_ps(iy, jy);
603 dz = _mm_sub_ps(iz, jz);
604 dxB = _mm_sub_ps(ix, jxB);
605 dyB = _mm_sub_ps(iy, jyB);
606 dzB = _mm_sub_ps(iz, jzB);
608 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
609 rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
611 rinv = gmx_mm_invsqrt_ps(rsq);
612 r = _mm_mul_ps(rsq, rinv);
613 rinvB = gmx_mm_invsqrt_ps(rsqB);
614 rB = _mm_mul_ps(rsqB, rinvB);
616 /* Compute raj_inv aj1-4 */
617 raj_inv = gmx_mm_inv_ps(raj);
618 raj_invB = gmx_mm_inv_ps(rajB);
620 /* Evaluate influence of atom aj -> ai */
621 t1 = _mm_add_ps(r, sk_aj);
622 t2 = _mm_sub_ps(r, sk_aj);
623 t3 = _mm_sub_ps(sk_aj, r);
624 t1B = _mm_add_ps(rB, sk_ajB);
625 t2B = _mm_sub_ps(rB, sk_ajB);
626 t3B = _mm_sub_ps(sk_ajB, rB);
627 obc_mask1 = _mm_cmplt_ps(rai, t1);
628 obc_mask2 = _mm_cmplt_ps(rai, t2);
629 obc_mask3 = _mm_cmplt_ps(rai, t3);
630 obc_mask1B = _mm_cmplt_ps(rai, t1B);
631 obc_mask2B = _mm_cmplt_ps(rai, t2B);
632 obc_mask3B = _mm_cmplt_ps(rai, t3B);
634 uij = gmx_mm_inv_ps(t1);
635 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
636 _mm_andnot_ps(obc_mask2, rai_inv));
637 dlij = _mm_and_ps(one, obc_mask2);
638 uij2 = _mm_mul_ps(uij, uij);
639 uij3 = _mm_mul_ps(uij2, uij);
640 lij2 = _mm_mul_ps(lij, lij);
641 lij3 = _mm_mul_ps(lij2, lij);
643 uijB = gmx_mm_inv_ps(t1B);
644 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
645 _mm_andnot_ps(obc_mask2B, rai_inv));
646 dlijB = _mm_and_ps(one, obc_mask2B);
647 uij2B = _mm_mul_ps(uijB, uijB);
648 uij3B = _mm_mul_ps(uij2B, uijB);
649 lij2B = _mm_mul_ps(lijB, lijB);
650 lij3B = _mm_mul_ps(lij2B, lijB);
652 diff2 = _mm_sub_ps(uij2, lij2);
653 lij_inv = gmx_mm_invsqrt_ps(lij2);
654 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
655 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
656 prod = _mm_mul_ps(onefourth, sk2_rinv);
658 diff2B = _mm_sub_ps(uij2B, lij2B);
659 lij_invB = gmx_mm_invsqrt_ps(lij2B);
660 sk2_ajB = _mm_mul_ps(sk_ajB, sk_ajB);
661 sk2_rinvB = _mm_mul_ps(sk2_ajB, rinvB);
662 prodB = _mm_mul_ps(onefourth, sk2_rinvB);
664 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
665 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
667 t1 = _mm_sub_ps(lij, uij);
668 t2 = _mm_mul_ps(diff2,
669 _mm_sub_ps(_mm_mul_ps(onefourth, r),
671 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
672 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
673 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
674 t4 = _mm_and_ps(t4, obc_mask3);
675 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
677 t1B = _mm_sub_ps(lijB, uijB);
678 t2B = _mm_mul_ps(diff2B,
679 _mm_sub_ps(_mm_mul_ps(onefourth, rB),
681 t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
682 t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
683 t4B = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
684 t4B = _mm_and_ps(t4B, obc_mask3B);
685 t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
687 sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
689 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
690 _mm_mul_ps(prod, lij3));
692 _mm_mul_ps(onefourth,
693 _mm_add_ps(_mm_mul_ps(lij, rinv),
694 _mm_mul_ps(lij3, r))));
695 t2 = _mm_mul_ps(onefourth,
696 _mm_add_ps(_mm_mul_ps(uij, rinv),
697 _mm_mul_ps(uij3, r)));
699 _mm_add_ps(_mm_mul_ps(half, uij2),
700 _mm_mul_ps(prod, uij3)));
701 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
702 _mm_mul_ps(rinv, rinv));
704 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
706 _mm_mul_ps(sk2_rinv, rinv))));
707 t1 = _mm_mul_ps(rinv,
708 _mm_add_ps(_mm_mul_ps(dlij, t1),
709 _mm_add_ps(t2, t3)));
713 t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
714 _mm_mul_ps(prodB, lij3B));
715 t1B = _mm_sub_ps(t1B,
716 _mm_mul_ps(onefourth,
717 _mm_add_ps(_mm_mul_ps(lijB, rinvB),
718 _mm_mul_ps(lij3B, rB))));
719 t2B = _mm_mul_ps(onefourth,
720 _mm_add_ps(_mm_mul_ps(uijB, rinvB),
721 _mm_mul_ps(uij3B, rB)));
722 t2B = _mm_sub_ps(t2B,
723 _mm_add_ps(_mm_mul_ps(half, uij2B),
724 _mm_mul_ps(prodB, uij3B)));
725 t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
726 _mm_mul_ps(rinvB, rinvB));
727 t3B = _mm_sub_ps(t3B,
728 _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
730 _mm_mul_ps(sk2_rinvB, rinvB))));
731 t1B = _mm_mul_ps(rinvB,
732 _mm_add_ps(_mm_mul_ps(dlijB, t1B),
733 _mm_add_ps(t2B, t3B)));
735 dadx1 = _mm_and_ps(t1, obc_mask1);
736 dadx1B = _mm_and_ps(t1B, obc_mask1B);
739 /* Evaluate influence of atom ai -> aj */
740 t1 = _mm_add_ps(r, sk_ai);
741 t2 = _mm_sub_ps(r, sk_ai);
742 t3 = _mm_sub_ps(sk_ai, r);
743 t1B = _mm_add_ps(rB, sk_ai);
744 t2B = _mm_sub_ps(rB, sk_ai);
745 t3B = _mm_sub_ps(sk_ai, rB);
746 obc_mask1 = _mm_cmplt_ps(raj, t1);
747 obc_mask2 = _mm_cmplt_ps(raj, t2);
748 obc_mask3 = _mm_cmplt_ps(raj, t3);
749 obc_mask1B = _mm_cmplt_ps(rajB, t1B);
750 obc_mask2B = _mm_cmplt_ps(rajB, t2B);
751 obc_mask3B = _mm_cmplt_ps(rajB, t3B);
753 uij = gmx_mm_inv_ps(t1);
754 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
755 _mm_andnot_ps(obc_mask2, raj_inv));
756 dlij = _mm_and_ps(one, obc_mask2);
757 uij2 = _mm_mul_ps(uij, uij);
758 uij3 = _mm_mul_ps(uij2, uij);
759 lij2 = _mm_mul_ps(lij, lij);
760 lij3 = _mm_mul_ps(lij2, lij);
762 uijB = gmx_mm_inv_ps(t1B);
763 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
764 _mm_andnot_ps(obc_mask2B, raj_invB));
765 dlijB = _mm_and_ps(one, obc_mask2B);
766 uij2B = _mm_mul_ps(uijB, uijB);
767 uij3B = _mm_mul_ps(uij2B, uijB);
768 lij2B = _mm_mul_ps(lijB, lijB);
769 lij3B = _mm_mul_ps(lij2B, lijB);
771 diff2 = _mm_sub_ps(uij2, lij2);
772 lij_inv = gmx_mm_invsqrt_ps(lij2);
773 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
774 prod = _mm_mul_ps(onefourth, sk2_rinv);
776 diff2B = _mm_sub_ps(uij2B, lij2B);
777 lij_invB = gmx_mm_invsqrt_ps(lij2B);
778 sk2_rinvB = _mm_mul_ps(sk2_ai, rinvB);
779 prodB = _mm_mul_ps(onefourth, sk2_rinvB);
781 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
782 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
784 t1 = _mm_sub_ps(lij, uij);
785 t2 = _mm_mul_ps(diff2,
786 _mm_sub_ps(_mm_mul_ps(onefourth, r),
788 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
789 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
790 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
791 t4 = _mm_and_ps(t4, obc_mask3);
792 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
794 t1B = _mm_sub_ps(lijB, uijB);
795 t2B = _mm_mul_ps(diff2B,
796 _mm_sub_ps(_mm_mul_ps(onefourth, rB),
798 t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
799 t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
800 t4B = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
801 t4B = _mm_and_ps(t4B, obc_mask3B);
802 t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
804 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
805 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
807 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
808 _mm_mul_ps(prod, lij3));
810 _mm_mul_ps(onefourth,
811 _mm_add_ps(_mm_mul_ps(lij, rinv),
812 _mm_mul_ps(lij3, r))));
813 t2 = _mm_mul_ps(onefourth,
814 _mm_add_ps(_mm_mul_ps(uij, rinv),
815 _mm_mul_ps(uij3, r)));
817 _mm_add_ps(_mm_mul_ps(half, uij2),
818 _mm_mul_ps(prod, uij3)));
819 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
820 _mm_mul_ps(rinv, rinv));
822 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
824 _mm_mul_ps(sk2_rinv, rinv))));
825 t1 = _mm_mul_ps(rinv,
826 _mm_add_ps(_mm_mul_ps(dlij, t1),
827 _mm_add_ps(t2, t3)));
830 t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
831 _mm_mul_ps(prodB, lij3B));
832 t1B = _mm_sub_ps(t1B,
833 _mm_mul_ps(onefourth,
834 _mm_add_ps(_mm_mul_ps(lijB, rinvB),
835 _mm_mul_ps(lij3B, rB))));
836 t2B = _mm_mul_ps(onefourth,
837 _mm_add_ps(_mm_mul_ps(uijB, rinvB),
838 _mm_mul_ps(uij3B, rB)));
839 t2B = _mm_sub_ps(t2B,
840 _mm_add_ps(_mm_mul_ps(half, uij2B),
841 _mm_mul_ps(prodB, uij3B)));
842 t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
843 _mm_mul_ps(rinvB, rinvB));
844 t3B = _mm_sub_ps(t3B,
845 _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
847 _mm_mul_ps(sk2_rinvB, rinvB))));
848 t1B = _mm_mul_ps(rinvB,
849 _mm_add_ps(_mm_mul_ps(dlijB, t1B),
850 _mm_add_ps(t2B, t3B)));
853 dadx2 = _mm_and_ps(t1, obc_mask1);
854 dadx2B = _mm_and_ps(t1B, obc_mask1B);
856 _mm_store_ps(dadx, dadx1);
858 _mm_store_ps(dadx, dadx2);
860 _mm_store_ps(dadx, dadx1B);
862 _mm_store_ps(dadx, dadx2B);
865 } /* end normal inner loop */
867 for (; k < nj1-offset; k += 4)
879 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
880 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
881 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
883 dx = _mm_sub_ps(ix, jx);
884 dy = _mm_sub_ps(iy, jy);
885 dz = _mm_sub_ps(iz, jz);
887 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
889 rinv = gmx_mm_invsqrt_ps(rsq);
890 r = _mm_mul_ps(rsq, rinv);
892 /* Compute raj_inv aj1-4 */
893 raj_inv = gmx_mm_inv_ps(raj);
895 /* Evaluate influence of atom aj -> ai */
896 t1 = _mm_add_ps(r, sk_aj);
897 obc_mask1 = _mm_cmplt_ps(rai, t1);
899 if (_mm_movemask_ps(obc_mask1))
901 /* If any of the elements has rai<dr+sk, this is executed */
902 t2 = _mm_sub_ps(r, sk_aj);
903 t3 = _mm_sub_ps(sk_aj, r);
905 obc_mask2 = _mm_cmplt_ps(rai, t2);
906 obc_mask3 = _mm_cmplt_ps(rai, t3);
908 uij = gmx_mm_inv_ps(t1);
909 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
910 _mm_andnot_ps(obc_mask2, rai_inv));
911 dlij = _mm_and_ps(one, obc_mask2);
912 uij2 = _mm_mul_ps(uij, uij);
913 uij3 = _mm_mul_ps(uij2, uij);
914 lij2 = _mm_mul_ps(lij, lij);
915 lij3 = _mm_mul_ps(lij2, lij);
916 diff2 = _mm_sub_ps(uij2, lij2);
917 lij_inv = gmx_mm_invsqrt_ps(lij2);
918 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
919 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
920 prod = _mm_mul_ps(onefourth, sk2_rinv);
921 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
922 t1 = _mm_sub_ps(lij, uij);
923 t2 = _mm_mul_ps(diff2,
924 _mm_sub_ps(_mm_mul_ps(onefourth, r),
926 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
927 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
928 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
929 t4 = _mm_and_ps(t4, obc_mask3);
930 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
931 sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
932 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
933 _mm_mul_ps(prod, lij3));
935 _mm_mul_ps(onefourth,
936 _mm_add_ps(_mm_mul_ps(lij, rinv),
937 _mm_mul_ps(lij3, r))));
938 t2 = _mm_mul_ps(onefourth,
939 _mm_add_ps(_mm_mul_ps(uij, rinv),
940 _mm_mul_ps(uij3, r)));
942 _mm_add_ps(_mm_mul_ps(half, uij2),
943 _mm_mul_ps(prod, uij3)));
944 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
945 _mm_mul_ps(rinv, rinv));
947 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
949 _mm_mul_ps(sk2_rinv, rinv))));
950 t1 = _mm_mul_ps(rinv,
951 _mm_add_ps(_mm_mul_ps(dlij, t1),
952 _mm_add_ps(t2, t3)));
954 dadx1 = _mm_and_ps(t1, obc_mask1);
958 dadx1 = _mm_setzero_ps();
961 /* Evaluate influence of atom ai -> aj */
962 t1 = _mm_add_ps(r, sk_ai);
963 obc_mask1 = _mm_cmplt_ps(raj, t1);
965 if (_mm_movemask_ps(obc_mask1))
967 t2 = _mm_sub_ps(r, sk_ai);
968 t3 = _mm_sub_ps(sk_ai, r);
969 obc_mask2 = _mm_cmplt_ps(raj, t2);
970 obc_mask3 = _mm_cmplt_ps(raj, t3);
972 uij = gmx_mm_inv_ps(t1);
973 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
974 _mm_andnot_ps(obc_mask2, raj_inv));
975 dlij = _mm_and_ps(one, obc_mask2);
976 uij2 = _mm_mul_ps(uij, uij);
977 uij3 = _mm_mul_ps(uij2, uij);
978 lij2 = _mm_mul_ps(lij, lij);
979 lij3 = _mm_mul_ps(lij2, lij);
980 diff2 = _mm_sub_ps(uij2, lij2);
981 lij_inv = gmx_mm_invsqrt_ps(lij2);
982 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
983 prod = _mm_mul_ps(onefourth, sk2_rinv);
984 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
985 t1 = _mm_sub_ps(lij, uij);
986 t2 = _mm_mul_ps(diff2,
987 _mm_sub_ps(_mm_mul_ps(onefourth, r),
989 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
990 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
991 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
992 t4 = _mm_and_ps(t4, obc_mask3);
993 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
995 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
997 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
998 _mm_mul_ps(prod, lij3));
1000 _mm_mul_ps(onefourth,
1001 _mm_add_ps(_mm_mul_ps(lij, rinv),
1002 _mm_mul_ps(lij3, r))));
1003 t2 = _mm_mul_ps(onefourth,
1004 _mm_add_ps(_mm_mul_ps(uij, rinv),
1005 _mm_mul_ps(uij3, r)));
1007 _mm_add_ps(_mm_mul_ps(half, uij2),
1008 _mm_mul_ps(prod, uij3)));
1009 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1010 _mm_mul_ps(rinv, rinv));
1012 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1014 _mm_mul_ps(sk2_rinv, rinv))));
1015 t1 = _mm_mul_ps(rinv,
1016 _mm_add_ps(_mm_mul_ps(dlij, t1),
1017 _mm_add_ps(t2, t3)));
1018 dadx2 = _mm_and_ps(t1, obc_mask1);
1022 dadx2 = _mm_setzero_ps();
1025 _mm_store_ps(dadx, dadx1);
1027 _mm_store_ps(dadx, dadx2);
1029 } /* end normal inner loop */
1037 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1038 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1039 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1042 else if (offset == 2)
1048 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1049 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1050 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1055 /* offset must be 3 */
1062 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1063 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1064 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1068 dx = _mm_sub_ps(ix, jx);
1069 dy = _mm_sub_ps(iy, jy);
1070 dz = _mm_sub_ps(iz, jz);
1072 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
1074 rinv = gmx_mm_invsqrt_ps(rsq);
1075 r = _mm_mul_ps(rsq, rinv);
1077 /* Compute raj_inv aj1-4 */
1078 raj_inv = gmx_mm_inv_ps(raj);
1080 /* Evaluate influence of atom aj -> ai */
1081 t1 = _mm_add_ps(r, sk_aj);
1082 obc_mask1 = _mm_cmplt_ps(rai, t1);
1083 obc_mask1 = _mm_and_ps(obc_mask1, mask);
1085 if (_mm_movemask_ps(obc_mask1))
1087 t2 = _mm_sub_ps(r, sk_aj);
1088 t3 = _mm_sub_ps(sk_aj, r);
1089 obc_mask2 = _mm_cmplt_ps(rai, t2);
1090 obc_mask3 = _mm_cmplt_ps(rai, t3);
1092 uij = gmx_mm_inv_ps(t1);
1093 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1094 _mm_andnot_ps(obc_mask2, rai_inv));
1095 dlij = _mm_and_ps(one, obc_mask2);
1096 uij2 = _mm_mul_ps(uij, uij);
1097 uij3 = _mm_mul_ps(uij2, uij);
1098 lij2 = _mm_mul_ps(lij, lij);
1099 lij3 = _mm_mul_ps(lij2, lij);
1100 diff2 = _mm_sub_ps(uij2, lij2);
1101 lij_inv = gmx_mm_invsqrt_ps(lij2);
1102 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
1103 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
1104 prod = _mm_mul_ps(onefourth, sk2_rinv);
1105 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1106 t1 = _mm_sub_ps(lij, uij);
1107 t2 = _mm_mul_ps(diff2,
1108 _mm_sub_ps(_mm_mul_ps(onefourth, r),
1110 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1111 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1112 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1113 t4 = _mm_and_ps(t4, obc_mask3);
1114 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1115 sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1116 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
1117 _mm_mul_ps(prod, lij3));
1119 _mm_mul_ps(onefourth,
1120 _mm_add_ps(_mm_mul_ps(lij, rinv),
1121 _mm_mul_ps(lij3, r))));
1122 t2 = _mm_mul_ps(onefourth,
1123 _mm_add_ps(_mm_mul_ps(uij, rinv),
1124 _mm_mul_ps(uij3, r)));
1126 _mm_add_ps(_mm_mul_ps(half, uij2),
1127 _mm_mul_ps(prod, uij3)));
1128 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1129 _mm_mul_ps(rinv, rinv));
1131 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1133 _mm_mul_ps(sk2_rinv, rinv))));
1134 t1 = _mm_mul_ps(rinv,
1135 _mm_add_ps(_mm_mul_ps(dlij, t1),
1136 _mm_add_ps(t2, t3)));
1137 dadx1 = _mm_and_ps(t1, obc_mask1);
1141 dadx1 = _mm_setzero_ps();
1144 /* Evaluate influence of atom ai -> aj */
1145 t1 = _mm_add_ps(r, sk_ai);
1146 obc_mask1 = _mm_cmplt_ps(raj, t1);
1147 obc_mask1 = _mm_and_ps(obc_mask1, mask);
1149 if (_mm_movemask_ps(obc_mask1))
1151 t2 = _mm_sub_ps(r, sk_ai);
1152 t3 = _mm_sub_ps(sk_ai, r);
1153 obc_mask2 = _mm_cmplt_ps(raj, t2);
1154 obc_mask3 = _mm_cmplt_ps(raj, t3);
1156 uij = gmx_mm_inv_ps(t1);
1157 lij = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1158 _mm_andnot_ps(obc_mask2, raj_inv));
1159 dlij = _mm_and_ps(one, obc_mask2);
1160 uij2 = _mm_mul_ps(uij, uij);
1161 uij3 = _mm_mul_ps(uij2, uij);
1162 lij2 = _mm_mul_ps(lij, lij);
1163 lij3 = _mm_mul_ps(lij2, lij);
1164 diff2 = _mm_sub_ps(uij2, lij2);
1165 lij_inv = gmx_mm_invsqrt_ps(lij2);
1166 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
1167 prod = _mm_mul_ps(onefourth, sk2_rinv);
1168 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1169 t1 = _mm_sub_ps(lij, uij);
1170 t2 = _mm_mul_ps(diff2,
1171 _mm_sub_ps(_mm_mul_ps(onefourth, r),
1173 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1174 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1175 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
1176 t4 = _mm_and_ps(t4, obc_mask3);
1177 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1179 tmp = _mm_and_ps(t1, obc_mask1);
1181 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
1182 _mm_mul_ps(prod, lij3));
1184 _mm_mul_ps(onefourth,
1185 _mm_add_ps(_mm_mul_ps(lij, rinv),
1186 _mm_mul_ps(lij3, r))));
1187 t2 = _mm_mul_ps(onefourth,
1188 _mm_add_ps(_mm_mul_ps(uij, rinv),
1189 _mm_mul_ps(uij3, r)));
1191 _mm_add_ps(_mm_mul_ps(half, uij2),
1192 _mm_mul_ps(prod, uij3)));
1193 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1194 _mm_mul_ps(rinv, rinv));
1196 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1198 _mm_mul_ps(sk2_rinv, rinv))));
1199 t1 = _mm_mul_ps(rinv,
1200 _mm_add_ps(_mm_mul_ps(dlij, t1),
1201 _mm_add_ps(t2, t3)));
1202 dadx2 = _mm_and_ps(t1, obc_mask1);
1206 dadx2 = _mm_setzero_ps();
1207 tmp = _mm_setzero_ps();
1210 _mm_store_ps(dadx, dadx1);
1212 _mm_store_ps(dadx, dadx2);
1217 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
1219 else if (offset == 2)
1221 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
1225 /* offset must be 3 */
1226 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
1230 GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
1234 /* Parallel summations */
1237 gmx_sum(natoms, work, cr);
1239 else if (DOMAINDECOMP(cr))
1241 dd_atom_sum_real(cr->dd, work);
1244 if (gb_algorithm == egbHCT)
1247 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1249 if (born->use[i] != 0)
1251 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1252 sum = 1.0/rr - work[i];
1253 min_rad = rr + doffset;
1256 born->bRad[i] = rad > min_rad ? rad : min_rad;
1257 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1261 /* Extra communication required for DD */
1262 if (DOMAINDECOMP(cr))
1264 dd_atom_spread_real(cr->dd, born->bRad);
1265 dd_atom_spread_real(cr->dd, fr->invsqrta);
1271 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1273 if (born->use[i] != 0)
1275 rr = top->atomtypes.gb_radius[md->typeA[i]];
1283 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1284 born->bRad[i] = rr_inv - tsum*rr_inv2;
1285 born->bRad[i] = 1.0 / born->bRad[i];
1287 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1289 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1290 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1293 /* Extra (local) communication required for DD */
1294 if (DOMAINDECOMP(cr))
1296 dd_atom_spread_real(cr->dd, born->bRad);
1297 dd_atom_spread_real(cr->dd, fr->invsqrta);
1298 dd_atom_spread_real(cr->dd, born->drobc);
1309 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1310 float *x, float *f, float *fshift, float *shiftvec,
1311 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1313 int i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
1314 int jnrA, jnrB, jnrC, jnrD;
1315 int j3A, j3B, j3C, j3D;
1316 int jnrE, jnrF, jnrG, jnrH;
1317 int j3E, j3F, j3G, j3H;
1320 float rbi, shX, shY, shZ;
1325 __m128 jxB, jyB, jzB;
1326 __m128 fix, fiy, fiz;
1329 __m128 dxB, dyB, dzB;
1330 __m128 txB, tyB, tzB;
1332 __m128 rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
1333 __m128 xmm1, xmm2, xmm3;
1335 const __m128 two = _mm_set1_ps(2.0f);
1341 /* Loop to get the proper form for the Born radius term, sse style */
1347 if (gb_algorithm == egbSTILL)
1349 for (i = n0; i < n1; i++)
1351 rbi = born->bRad[i];
1352 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1355 else if (gb_algorithm == egbHCT)
1357 for (i = n0; i < n1; i++)
1359 rbi = born->bRad[i];
1360 rb[i] = rbi * rbi * dvda[i];
1363 else if (gb_algorithm == egbOBC)
1365 for (i = n0; i < n1; i++)
1367 rbi = born->bRad[i];
1368 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1372 jz = _mm_setzero_ps();
1374 n = j3A = j3B = j3C = j3D = 0;
1376 for (i = 0; i < nl->nri; i++)
1380 is3 = 3*nl->shift[i];
1381 shX = shiftvec[is3];
1382 shY = shiftvec[is3+1];
1383 shZ = shiftvec[is3+2];
1384 nj0 = nl->jindex[i];
1385 nj1 = nl->jindex[i+1];
1387 ix = _mm_set1_ps(shX+x[ii3+0]);
1388 iy = _mm_set1_ps(shY+x[ii3+1]);
1389 iz = _mm_set1_ps(shZ+x[ii3+2]);
1391 offset = (nj1-nj0)%4;
1393 rbai = _mm_load1_ps(rb+ii);
1394 fix = _mm_setzero_ps();
1395 fiy = _mm_setzero_ps();
1396 fiz = _mm_setzero_ps();
1399 for (k = nj0; k < nj1-offset; k += 4)
1411 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
1413 dx = _mm_sub_ps(ix, jx);
1414 dy = _mm_sub_ps(iy, jy);
1415 dz = _mm_sub_ps(iz, jz);
1417 GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
1419 /* load chain rule terms for j1-4 */
1420 f_gb = _mm_load_ps(dadx);
1422 f_gb_ai = _mm_load_ps(dadx);
1425 /* calculate scalar force */
1426 f_gb = _mm_mul_ps(f_gb, rbai);
1427 f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1428 f_gb = _mm_add_ps(f_gb, f_gb_ai);
1430 tx = _mm_mul_ps(f_gb, dx);
1431 ty = _mm_mul_ps(f_gb, dy);
1432 tz = _mm_mul_ps(f_gb, dz);
1434 fix = _mm_add_ps(fix, tx);
1435 fiy = _mm_add_ps(fiy, ty);
1436 fiz = _mm_add_ps(fiz, tz);
1438 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
1441 /*deal with odd elements */
1448 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1449 GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
1451 else if (offset == 2)
1457 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1458 GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
1462 /* offset must be 3 */
1469 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1470 GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
1473 dx = _mm_sub_ps(ix, jx);
1474 dy = _mm_sub_ps(iy, jy);
1475 dz = _mm_sub_ps(iz, jz);
1477 /* load chain rule terms for j1-4 */
1478 f_gb = _mm_load_ps(dadx);
1480 f_gb_ai = _mm_load_ps(dadx);
1483 /* calculate scalar force */
1484 f_gb = _mm_mul_ps(f_gb, rbai);
1485 f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1486 f_gb = _mm_add_ps(f_gb, f_gb_ai);
1488 tx = _mm_mul_ps(f_gb, dx);
1489 ty = _mm_mul_ps(f_gb, dy);
1490 tz = _mm_mul_ps(f_gb, dz);
1492 fix = _mm_add_ps(fix, tx);
1493 fiy = _mm_add_ps(fiy, ty);
1494 fiz = _mm_add_ps(fiz, tz);
1498 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
1500 else if (offset == 2)
1502 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
1506 /* offset must be 3 */
1507 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
1511 /* fix/fiy/fiz now contain four partial force terms, that all should be
1512 * added to the i particle forces and shift forces.
1514 gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
1522 /* keep compiler happy */
1523 int genborn_sse_dummy;
1525 #endif /* SSE intrinsics available */