2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2013,2014, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
45 #include "gromacs/utility/smalloc.h"
47 #include "gromacs/math/vec.h"
48 #include "gromacs/fileio/pdbio.h"
50 #include "gromacs/math/units.h"
53 #include "gromacs/utility/fatalerror.h"
56 #include "gromacs/utility/gmxmpi.h"
59 /* Only compile this file if SSE intrinsics are available */
60 #if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
62 #include <gmx_sse2_single.h>
63 #include <emmintrin.h>
65 #include "genborn_sse2_single.h"
69 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
70 int natoms, gmx_localtop_t *top,
71 float *x, t_nblist *nl,
74 int i, k, n, ii, is3, ii3, nj0, nj1, offset;
75 int jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
76 int jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
97 __m128 rsq, rinv, rinv2, rinv4, rinv6;
98 __m128 rsqB, rinvB, rinv2B, rinv4B, rinv6B;
99 __m128 ratio, gpi, rai, raj, vai, vaj, rvdw;
100 __m128 ratioB, rajB, vajB, rvdwB;
101 __m128 ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
102 __m128 ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
103 __m128 mask, icf4, icf6, mask_cmp;
104 __m128 icf4B, icf6B, mask_cmpB;
106 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
107 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
108 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
110 const __m128 half = _mm_set1_ps(0.5f);
111 const __m128 three = _mm_set1_ps(3.0f);
112 const __m128 one = _mm_set1_ps(1.0f);
113 const __m128 two = _mm_set1_ps(2.0f);
114 const __m128 zero = _mm_set1_ps(0.0f);
115 const __m128 four = _mm_set1_ps(4.0f);
117 const __m128 still_p5inv = _mm_set1_ps(STILL_P5INV);
118 const __m128 still_pip5 = _mm_set1_ps(STILL_PIP5);
119 const __m128 still_p4 = _mm_set1_ps(STILL_P4);
121 factor = 0.5 * ONE_4PI_EPS0;
123 gb_radius = born->gb_radius;
125 work = born->gpol_still_work;
127 shiftvec = fr->shift_vec[0];
130 jnrA = jnrB = jnrC = jnrD = 0;
131 jx = _mm_setzero_ps();
132 jy = _mm_setzero_ps();
133 jz = _mm_setzero_ps();
137 for (i = 0; i < natoms; i++)
142 for (i = 0; i < nl->nri; i++)
146 is3 = 3*nl->shift[i];
148 shY = shiftvec[is3+1];
149 shZ = shiftvec[is3+2];
151 nj1 = nl->jindex[i+1];
153 ix = _mm_set1_ps(shX+x[ii3+0]);
154 iy = _mm_set1_ps(shY+x[ii3+1]);
155 iz = _mm_set1_ps(shZ+x[ii3+2]);
157 offset = (nj1-nj0)%4;
159 /* Polarization energy for atom ai */
160 gpi = _mm_setzero_ps();
162 rai = _mm_load1_ps(gb_radius+ii);
163 prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
165 for (k = nj0; k < nj1-4-offset; k += 8)
185 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
186 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
188 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
189 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
190 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
191 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);
193 dx = _mm_sub_ps(ix, jx);
194 dy = _mm_sub_ps(iy, jy);
195 dz = _mm_sub_ps(iz, jz);
196 dxB = _mm_sub_ps(ix, jxB);
197 dyB = _mm_sub_ps(iy, jyB);
198 dzB = _mm_sub_ps(iz, jzB);
200 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
201 rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
202 rinv = gmx_mm_invsqrt_ps(rsq);
203 rinvB = gmx_mm_invsqrt_ps(rsqB);
204 rinv2 = _mm_mul_ps(rinv, rinv);
205 rinv2B = _mm_mul_ps(rinvB, rinvB);
206 rinv4 = _mm_mul_ps(rinv2, rinv2);
207 rinv4B = _mm_mul_ps(rinv2B, rinv2B);
208 rinv6 = _mm_mul_ps(rinv4, rinv2);
209 rinv6B = _mm_mul_ps(rinv4B, rinv2B);
211 rvdw = _mm_add_ps(rai, raj);
212 rvdwB = _mm_add_ps(rai, rajB);
213 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
214 ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));
216 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
217 mask_cmpB = _mm_cmple_ps(ratioB, still_p5inv);
219 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
220 if (0 == _mm_movemask_ps(mask_cmp) )
222 /* if ratio>still_p5inv for ALL elements */
224 dccf = _mm_setzero_ps();
228 ratio = _mm_min_ps(ratio, still_p5inv);
229 theta = _mm_mul_ps(ratio, still_pip5);
230 gmx_mm_sincos_ps(theta, &sinq, &cosq);
231 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
232 ccf = _mm_mul_ps(term, term);
233 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
234 _mm_mul_ps(sinq, theta));
236 if (0 == _mm_movemask_ps(mask_cmpB) )
238 /* if ratio>still_p5inv for ALL elements */
240 dccfB = _mm_setzero_ps();
244 ratioB = _mm_min_ps(ratioB, still_p5inv);
245 thetaB = _mm_mul_ps(ratioB, still_pip5);
246 gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
247 termB = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
248 ccfB = _mm_mul_ps(termB, termB);
249 dccfB = _mm_mul_ps(_mm_mul_ps(two, termB),
250 _mm_mul_ps(sinqB, thetaB));
253 prod = _mm_mul_ps(still_p4, vaj);
254 prodB = _mm_mul_ps(still_p4, vajB);
255 icf4 = _mm_mul_ps(ccf, rinv4);
256 icf4B = _mm_mul_ps(ccfB, rinv4B);
257 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
258 icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);
260 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
261 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));
263 gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );
265 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
267 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
269 _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
271 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
275 for (; k < nj1-offset; k += 4)
287 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
289 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
290 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
292 dx = _mm_sub_ps(ix, jx);
293 dy = _mm_sub_ps(iy, jy);
294 dz = _mm_sub_ps(iz, jz);
296 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
297 rinv = gmx_mm_invsqrt_ps(rsq);
298 rinv2 = _mm_mul_ps(rinv, rinv);
299 rinv4 = _mm_mul_ps(rinv2, rinv2);
300 rinv6 = _mm_mul_ps(rinv4, rinv2);
302 rvdw = _mm_add_ps(rai, raj);
303 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
305 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
307 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
308 if (0 == _mm_movemask_ps(mask_cmp))
310 /* if ratio>still_p5inv for ALL elements */
312 dccf = _mm_setzero_ps();
316 ratio = _mm_min_ps(ratio, still_p5inv);
317 theta = _mm_mul_ps(ratio, still_pip5);
318 gmx_mm_sincos_ps(theta, &sinq, &cosq);
319 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
320 ccf = _mm_mul_ps(term, term);
321 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
322 _mm_mul_ps(sinq, theta));
325 prod = _mm_mul_ps(still_p4, vaj);
326 icf4 = _mm_mul_ps(ccf, rinv4);
327 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
329 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
331 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
333 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
335 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
345 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
346 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
347 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
350 else if (offset == 2)
356 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
357 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
358 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
363 /* offset must be 3 */
370 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
371 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
372 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
376 dx = _mm_sub_ps(ix, jx);
377 dy = _mm_sub_ps(iy, jy);
378 dz = _mm_sub_ps(iz, jz);
380 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
381 rinv = gmx_mm_invsqrt_ps(rsq);
382 rinv2 = _mm_mul_ps(rinv, rinv);
383 rinv4 = _mm_mul_ps(rinv2, rinv2);
384 rinv6 = _mm_mul_ps(rinv4, rinv2);
386 rvdw = _mm_add_ps(rai, raj);
387 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
389 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
391 if (0 == _mm_movemask_ps(mask_cmp))
393 /* if ratio>still_p5inv for ALL elements */
395 dccf = _mm_setzero_ps();
399 ratio = _mm_min_ps(ratio, still_p5inv);
400 theta = _mm_mul_ps(ratio, still_pip5);
401 gmx_mm_sincos_ps(theta, &sinq, &cosq);
402 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
403 ccf = _mm_mul_ps(term, term);
404 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
405 _mm_mul_ps(sinq, theta));
408 prod = _mm_mul_ps(still_p4, vaj);
409 icf4 = _mm_mul_ps(ccf, rinv4);
410 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
412 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
414 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
416 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
419 tmp = _mm_mul_ps(prod_ai, icf4);
423 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
425 else if (offset == 2)
427 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
431 /* offset must be 3 */
432 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
435 GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
438 /* Sum up the polarization energy from other nodes */
439 if (DOMAINDECOMP(cr))
441 dd_atom_sum_real(cr->dd, work);
444 /* Compute the radii */
445 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
447 if (born->use[i] != 0)
449 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
450 gpi2 = gpi_ai * gpi_ai;
451 born->bRad[i] = factor*gmx_invsqrt(gpi2);
452 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
456 /* Extra (local) communication required for DD */
457 if (DOMAINDECOMP(cr))
459 dd_atom_spread_real(cr->dd, born->bRad);
460 dd_atom_spread_real(cr->dd, fr->invsqrta);
468 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
469 float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
471 int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
472 int jnrA, jnrB, jnrC, jnrD;
473 int j3A, j3B, j3C, j3D;
474 int jnrE, jnrF, jnrG, jnrH;
475 int j3E, j3F, j3G, j3H;
477 float rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
478 float sum_ai2, sum_ai3, tsum, tchain, doffset;
487 __m128 ix, iy, iz, jx, jy, jz;
488 __m128 dx, dy, dz, t1, t2, t3, t4;
490 __m128 rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
491 __m128 uij, lij2, uij2, lij3, uij3, diff2;
492 __m128 lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
493 __m128 sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
497 __m128 obc_mask1, obc_mask2, obc_mask3;
498 __m128 jxB, jyB, jzB, t1B, t2B, t3B, t4B;
499 __m128 dxB, dyB, dzB, rsqB, rinvB, rB;
500 __m128 rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
501 __m128 uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
502 __m128 lij_invB, sk2_invB, prodB;
503 __m128 sk_ajB, sk2_ajB, sk2_rinvB;
504 __m128 dadx1B, dadx2B;
506 __m128 obc_mask1B, obc_mask2B, obc_mask3B;
508 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
509 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
510 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
512 __m128 oneeighth = _mm_set1_ps(0.125);
513 __m128 onefourth = _mm_set1_ps(0.25);
515 const __m128 half = _mm_set1_ps(0.5f);
516 const __m128 three = _mm_set1_ps(3.0f);
517 const __m128 one = _mm_set1_ps(1.0f);
518 const __m128 two = _mm_set1_ps(2.0f);
519 const __m128 zero = _mm_set1_ps(0.0f);
520 const __m128 neg = _mm_set1_ps(-1.0f);
522 /* Set the dielectric offset */
523 doffset = born->gb_doffset;
524 gb_radius = born->gb_radius;
525 obc_param = born->param;
526 work = born->gpol_hct_work;
529 shiftvec = fr->shift_vec[0];
531 jx = _mm_setzero_ps();
532 jy = _mm_setzero_ps();
533 jz = _mm_setzero_ps();
535 jnrA = jnrB = jnrC = jnrD = 0;
537 for (i = 0; i < born->nr; i++)
542 for (i = 0; i < nl->nri; i++)
546 is3 = 3*nl->shift[i];
548 shY = shiftvec[is3+1];
549 shZ = shiftvec[is3+2];
551 nj1 = nl->jindex[i+1];
553 ix = _mm_set1_ps(shX+x[ii3+0]);
554 iy = _mm_set1_ps(shY+x[ii3+1]);
555 iz = _mm_set1_ps(shZ+x[ii3+2]);
557 offset = (nj1-nj0)%4;
559 rai = _mm_load1_ps(gb_radius+ii);
560 rai_inv = gmx_mm_inv_ps(rai);
562 sum_ai = _mm_setzero_ps();
564 sk_ai = _mm_load1_ps(born->param+ii);
565 sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
567 for (k = nj0; k < nj1-4-offset; k += 8)
587 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
588 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
589 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
590 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
591 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
592 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
594 dx = _mm_sub_ps(ix, jx);
595 dy = _mm_sub_ps(iy, jy);
596 dz = _mm_sub_ps(iz, jz);
597 dxB = _mm_sub_ps(ix, jxB);
598 dyB = _mm_sub_ps(iy, jyB);
599 dzB = _mm_sub_ps(iz, jzB);
601 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
602 rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
604 rinv = gmx_mm_invsqrt_ps(rsq);
605 r = _mm_mul_ps(rsq, rinv);
606 rinvB = gmx_mm_invsqrt_ps(rsqB);
607 rB = _mm_mul_ps(rsqB, rinvB);
609 /* Compute raj_inv aj1-4 */
610 raj_inv = gmx_mm_inv_ps(raj);
611 raj_invB = gmx_mm_inv_ps(rajB);
613 /* Evaluate influence of atom aj -> ai */
614 t1 = _mm_add_ps(r, sk_aj);
615 t2 = _mm_sub_ps(r, sk_aj);
616 t3 = _mm_sub_ps(sk_aj, r);
617 t1B = _mm_add_ps(rB, sk_ajB);
618 t2B = _mm_sub_ps(rB, sk_ajB);
619 t3B = _mm_sub_ps(sk_ajB, rB);
620 obc_mask1 = _mm_cmplt_ps(rai, t1);
621 obc_mask2 = _mm_cmplt_ps(rai, t2);
622 obc_mask3 = _mm_cmplt_ps(rai, t3);
623 obc_mask1B = _mm_cmplt_ps(rai, t1B);
624 obc_mask2B = _mm_cmplt_ps(rai, t2B);
625 obc_mask3B = _mm_cmplt_ps(rai, t3B);
627 uij = gmx_mm_inv_ps(t1);
628 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
629 _mm_andnot_ps(obc_mask2, rai_inv));
630 dlij = _mm_and_ps(one, obc_mask2);
631 uij2 = _mm_mul_ps(uij, uij);
632 uij3 = _mm_mul_ps(uij2, uij);
633 lij2 = _mm_mul_ps(lij, lij);
634 lij3 = _mm_mul_ps(lij2, lij);
636 uijB = gmx_mm_inv_ps(t1B);
637 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
638 _mm_andnot_ps(obc_mask2B, rai_inv));
639 dlijB = _mm_and_ps(one, obc_mask2B);
640 uij2B = _mm_mul_ps(uijB, uijB);
641 uij3B = _mm_mul_ps(uij2B, uijB);
642 lij2B = _mm_mul_ps(lijB, lijB);
643 lij3B = _mm_mul_ps(lij2B, lijB);
645 diff2 = _mm_sub_ps(uij2, lij2);
646 lij_inv = gmx_mm_invsqrt_ps(lij2);
647 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
648 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
649 prod = _mm_mul_ps(onefourth, sk2_rinv);
651 diff2B = _mm_sub_ps(uij2B, lij2B);
652 lij_invB = gmx_mm_invsqrt_ps(lij2B);
653 sk2_ajB = _mm_mul_ps(sk_ajB, sk_ajB);
654 sk2_rinvB = _mm_mul_ps(sk2_ajB, rinvB);
655 prodB = _mm_mul_ps(onefourth, sk2_rinvB);
657 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
658 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
660 t1 = _mm_sub_ps(lij, uij);
661 t2 = _mm_mul_ps(diff2,
662 _mm_sub_ps(_mm_mul_ps(onefourth, r),
664 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
665 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
666 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
667 t4 = _mm_and_ps(t4, obc_mask3);
668 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
670 t1B = _mm_sub_ps(lijB, uijB);
671 t2B = _mm_mul_ps(diff2B,
672 _mm_sub_ps(_mm_mul_ps(onefourth, rB),
674 t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
675 t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
676 t4B = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
677 t4B = _mm_and_ps(t4B, obc_mask3B);
678 t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
680 sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
682 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
683 _mm_mul_ps(prod, lij3));
685 _mm_mul_ps(onefourth,
686 _mm_add_ps(_mm_mul_ps(lij, rinv),
687 _mm_mul_ps(lij3, r))));
688 t2 = _mm_mul_ps(onefourth,
689 _mm_add_ps(_mm_mul_ps(uij, rinv),
690 _mm_mul_ps(uij3, r)));
692 _mm_add_ps(_mm_mul_ps(half, uij2),
693 _mm_mul_ps(prod, uij3)));
694 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
695 _mm_mul_ps(rinv, rinv));
697 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
699 _mm_mul_ps(sk2_rinv, rinv))));
700 t1 = _mm_mul_ps(rinv,
701 _mm_add_ps(_mm_mul_ps(dlij, t1),
702 _mm_add_ps(t2, t3)));
706 t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
707 _mm_mul_ps(prodB, lij3B));
708 t1B = _mm_sub_ps(t1B,
709 _mm_mul_ps(onefourth,
710 _mm_add_ps(_mm_mul_ps(lijB, rinvB),
711 _mm_mul_ps(lij3B, rB))));
712 t2B = _mm_mul_ps(onefourth,
713 _mm_add_ps(_mm_mul_ps(uijB, rinvB),
714 _mm_mul_ps(uij3B, rB)));
715 t2B = _mm_sub_ps(t2B,
716 _mm_add_ps(_mm_mul_ps(half, uij2B),
717 _mm_mul_ps(prodB, uij3B)));
718 t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
719 _mm_mul_ps(rinvB, rinvB));
720 t3B = _mm_sub_ps(t3B,
721 _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
723 _mm_mul_ps(sk2_rinvB, rinvB))));
724 t1B = _mm_mul_ps(rinvB,
725 _mm_add_ps(_mm_mul_ps(dlijB, t1B),
726 _mm_add_ps(t2B, t3B)));
728 dadx1 = _mm_and_ps(t1, obc_mask1);
729 dadx1B = _mm_and_ps(t1B, obc_mask1B);
732 /* Evaluate influence of atom ai -> aj */
733 t1 = _mm_add_ps(r, sk_ai);
734 t2 = _mm_sub_ps(r, sk_ai);
735 t3 = _mm_sub_ps(sk_ai, r);
736 t1B = _mm_add_ps(rB, sk_ai);
737 t2B = _mm_sub_ps(rB, sk_ai);
738 t3B = _mm_sub_ps(sk_ai, rB);
739 obc_mask1 = _mm_cmplt_ps(raj, t1);
740 obc_mask2 = _mm_cmplt_ps(raj, t2);
741 obc_mask3 = _mm_cmplt_ps(raj, t3);
742 obc_mask1B = _mm_cmplt_ps(rajB, t1B);
743 obc_mask2B = _mm_cmplt_ps(rajB, t2B);
744 obc_mask3B = _mm_cmplt_ps(rajB, t3B);
746 uij = gmx_mm_inv_ps(t1);
747 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
748 _mm_andnot_ps(obc_mask2, raj_inv));
749 dlij = _mm_and_ps(one, obc_mask2);
750 uij2 = _mm_mul_ps(uij, uij);
751 uij3 = _mm_mul_ps(uij2, uij);
752 lij2 = _mm_mul_ps(lij, lij);
753 lij3 = _mm_mul_ps(lij2, lij);
755 uijB = gmx_mm_inv_ps(t1B);
756 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
757 _mm_andnot_ps(obc_mask2B, raj_invB));
758 dlijB = _mm_and_ps(one, obc_mask2B);
759 uij2B = _mm_mul_ps(uijB, uijB);
760 uij3B = _mm_mul_ps(uij2B, uijB);
761 lij2B = _mm_mul_ps(lijB, lijB);
762 lij3B = _mm_mul_ps(lij2B, lijB);
764 diff2 = _mm_sub_ps(uij2, lij2);
765 lij_inv = gmx_mm_invsqrt_ps(lij2);
766 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
767 prod = _mm_mul_ps(onefourth, sk2_rinv);
769 diff2B = _mm_sub_ps(uij2B, lij2B);
770 lij_invB = gmx_mm_invsqrt_ps(lij2B);
771 sk2_rinvB = _mm_mul_ps(sk2_ai, rinvB);
772 prodB = _mm_mul_ps(onefourth, sk2_rinvB);
774 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
775 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
777 t1 = _mm_sub_ps(lij, uij);
778 t2 = _mm_mul_ps(diff2,
779 _mm_sub_ps(_mm_mul_ps(onefourth, r),
781 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
782 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
783 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
784 t4 = _mm_and_ps(t4, obc_mask3);
785 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
787 t1B = _mm_sub_ps(lijB, uijB);
788 t2B = _mm_mul_ps(diff2B,
789 _mm_sub_ps(_mm_mul_ps(onefourth, rB),
791 t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
792 t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
793 t4B = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
794 t4B = _mm_and_ps(t4B, obc_mask3B);
795 t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
797 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
798 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
800 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
801 _mm_mul_ps(prod, lij3));
803 _mm_mul_ps(onefourth,
804 _mm_add_ps(_mm_mul_ps(lij, rinv),
805 _mm_mul_ps(lij3, r))));
806 t2 = _mm_mul_ps(onefourth,
807 _mm_add_ps(_mm_mul_ps(uij, rinv),
808 _mm_mul_ps(uij3, r)));
810 _mm_add_ps(_mm_mul_ps(half, uij2),
811 _mm_mul_ps(prod, uij3)));
812 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
813 _mm_mul_ps(rinv, rinv));
815 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
817 _mm_mul_ps(sk2_rinv, rinv))));
818 t1 = _mm_mul_ps(rinv,
819 _mm_add_ps(_mm_mul_ps(dlij, t1),
820 _mm_add_ps(t2, t3)));
823 t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
824 _mm_mul_ps(prodB, lij3B));
825 t1B = _mm_sub_ps(t1B,
826 _mm_mul_ps(onefourth,
827 _mm_add_ps(_mm_mul_ps(lijB, rinvB),
828 _mm_mul_ps(lij3B, rB))));
829 t2B = _mm_mul_ps(onefourth,
830 _mm_add_ps(_mm_mul_ps(uijB, rinvB),
831 _mm_mul_ps(uij3B, rB)));
832 t2B = _mm_sub_ps(t2B,
833 _mm_add_ps(_mm_mul_ps(half, uij2B),
834 _mm_mul_ps(prodB, uij3B)));
835 t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
836 _mm_mul_ps(rinvB, rinvB));
837 t3B = _mm_sub_ps(t3B,
838 _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
840 _mm_mul_ps(sk2_rinvB, rinvB))));
841 t1B = _mm_mul_ps(rinvB,
842 _mm_add_ps(_mm_mul_ps(dlijB, t1B),
843 _mm_add_ps(t2B, t3B)));
846 dadx2 = _mm_and_ps(t1, obc_mask1);
847 dadx2B = _mm_and_ps(t1B, obc_mask1B);
849 _mm_store_ps(dadx, dadx1);
851 _mm_store_ps(dadx, dadx2);
853 _mm_store_ps(dadx, dadx1B);
855 _mm_store_ps(dadx, dadx2B);
858 } /* end normal inner loop */
860 for (; k < nj1-offset; k += 4)
872 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
873 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
874 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
876 dx = _mm_sub_ps(ix, jx);
877 dy = _mm_sub_ps(iy, jy);
878 dz = _mm_sub_ps(iz, jz);
880 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
882 rinv = gmx_mm_invsqrt_ps(rsq);
883 r = _mm_mul_ps(rsq, rinv);
885 /* Compute raj_inv aj1-4 */
886 raj_inv = gmx_mm_inv_ps(raj);
888 /* Evaluate influence of atom aj -> ai */
889 t1 = _mm_add_ps(r, sk_aj);
890 obc_mask1 = _mm_cmplt_ps(rai, t1);
892 if (_mm_movemask_ps(obc_mask1))
894 /* If any of the elements has rai<dr+sk, this is executed */
895 t2 = _mm_sub_ps(r, sk_aj);
896 t3 = _mm_sub_ps(sk_aj, r);
898 obc_mask2 = _mm_cmplt_ps(rai, t2);
899 obc_mask3 = _mm_cmplt_ps(rai, t3);
901 uij = gmx_mm_inv_ps(t1);
902 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
903 _mm_andnot_ps(obc_mask2, rai_inv));
904 dlij = _mm_and_ps(one, obc_mask2);
905 uij2 = _mm_mul_ps(uij, uij);
906 uij3 = _mm_mul_ps(uij2, uij);
907 lij2 = _mm_mul_ps(lij, lij);
908 lij3 = _mm_mul_ps(lij2, lij);
909 diff2 = _mm_sub_ps(uij2, lij2);
910 lij_inv = gmx_mm_invsqrt_ps(lij2);
911 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
912 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
913 prod = _mm_mul_ps(onefourth, sk2_rinv);
914 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
915 t1 = _mm_sub_ps(lij, uij);
916 t2 = _mm_mul_ps(diff2,
917 _mm_sub_ps(_mm_mul_ps(onefourth, r),
919 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
920 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
921 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
922 t4 = _mm_and_ps(t4, obc_mask3);
923 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
924 sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
925 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
926 _mm_mul_ps(prod, lij3));
928 _mm_mul_ps(onefourth,
929 _mm_add_ps(_mm_mul_ps(lij, rinv),
930 _mm_mul_ps(lij3, r))));
931 t2 = _mm_mul_ps(onefourth,
932 _mm_add_ps(_mm_mul_ps(uij, rinv),
933 _mm_mul_ps(uij3, r)));
935 _mm_add_ps(_mm_mul_ps(half, uij2),
936 _mm_mul_ps(prod, uij3)));
937 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
938 _mm_mul_ps(rinv, rinv));
940 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
942 _mm_mul_ps(sk2_rinv, rinv))));
943 t1 = _mm_mul_ps(rinv,
944 _mm_add_ps(_mm_mul_ps(dlij, t1),
945 _mm_add_ps(t2, t3)));
947 dadx1 = _mm_and_ps(t1, obc_mask1);
951 dadx1 = _mm_setzero_ps();
954 /* Evaluate influence of atom ai -> aj */
955 t1 = _mm_add_ps(r, sk_ai);
956 obc_mask1 = _mm_cmplt_ps(raj, t1);
958 if (_mm_movemask_ps(obc_mask1))
960 t2 = _mm_sub_ps(r, sk_ai);
961 t3 = _mm_sub_ps(sk_ai, r);
962 obc_mask2 = _mm_cmplt_ps(raj, t2);
963 obc_mask3 = _mm_cmplt_ps(raj, t3);
965 uij = gmx_mm_inv_ps(t1);
966 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
967 _mm_andnot_ps(obc_mask2, raj_inv));
968 dlij = _mm_and_ps(one, obc_mask2);
969 uij2 = _mm_mul_ps(uij, uij);
970 uij3 = _mm_mul_ps(uij2, uij);
971 lij2 = _mm_mul_ps(lij, lij);
972 lij3 = _mm_mul_ps(lij2, lij);
973 diff2 = _mm_sub_ps(uij2, lij2);
974 lij_inv = gmx_mm_invsqrt_ps(lij2);
975 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
976 prod = _mm_mul_ps(onefourth, sk2_rinv);
977 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
978 t1 = _mm_sub_ps(lij, uij);
979 t2 = _mm_mul_ps(diff2,
980 _mm_sub_ps(_mm_mul_ps(onefourth, r),
982 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
983 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
984 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
985 t4 = _mm_and_ps(t4, obc_mask3);
986 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
988 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
990 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
991 _mm_mul_ps(prod, lij3));
993 _mm_mul_ps(onefourth,
994 _mm_add_ps(_mm_mul_ps(lij, rinv),
995 _mm_mul_ps(lij3, r))));
996 t2 = _mm_mul_ps(onefourth,
997 _mm_add_ps(_mm_mul_ps(uij, rinv),
998 _mm_mul_ps(uij3, r)));
1000 _mm_add_ps(_mm_mul_ps(half, uij2),
1001 _mm_mul_ps(prod, uij3)));
1002 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1003 _mm_mul_ps(rinv, rinv));
1005 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1007 _mm_mul_ps(sk2_rinv, rinv))));
1008 t1 = _mm_mul_ps(rinv,
1009 _mm_add_ps(_mm_mul_ps(dlij, t1),
1010 _mm_add_ps(t2, t3)));
1011 dadx2 = _mm_and_ps(t1, obc_mask1);
1015 dadx2 = _mm_setzero_ps();
1018 _mm_store_ps(dadx, dadx1);
1020 _mm_store_ps(dadx, dadx2);
1022 } /* end normal inner loop */
1030 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1031 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1032 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1035 else if (offset == 2)
1041 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1042 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1043 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1048 /* offset must be 3 */
1055 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1056 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1057 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1061 dx = _mm_sub_ps(ix, jx);
1062 dy = _mm_sub_ps(iy, jy);
1063 dz = _mm_sub_ps(iz, jz);
1065 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
1067 rinv = gmx_mm_invsqrt_ps(rsq);
1068 r = _mm_mul_ps(rsq, rinv);
1070 /* Compute raj_inv aj1-4 */
1071 raj_inv = gmx_mm_inv_ps(raj);
1073 /* Evaluate influence of atom aj -> ai */
1074 t1 = _mm_add_ps(r, sk_aj);
1075 obc_mask1 = _mm_cmplt_ps(rai, t1);
1076 obc_mask1 = _mm_and_ps(obc_mask1, mask);
1078 if (_mm_movemask_ps(obc_mask1))
1080 t2 = _mm_sub_ps(r, sk_aj);
1081 t3 = _mm_sub_ps(sk_aj, r);
1082 obc_mask2 = _mm_cmplt_ps(rai, t2);
1083 obc_mask3 = _mm_cmplt_ps(rai, t3);
1085 uij = gmx_mm_inv_ps(t1);
1086 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1087 _mm_andnot_ps(obc_mask2, rai_inv));
1088 dlij = _mm_and_ps(one, obc_mask2);
1089 uij2 = _mm_mul_ps(uij, uij);
1090 uij3 = _mm_mul_ps(uij2, uij);
1091 lij2 = _mm_mul_ps(lij, lij);
1092 lij3 = _mm_mul_ps(lij2, lij);
1093 diff2 = _mm_sub_ps(uij2, lij2);
1094 lij_inv = gmx_mm_invsqrt_ps(lij2);
1095 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
1096 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
1097 prod = _mm_mul_ps(onefourth, sk2_rinv);
1098 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1099 t1 = _mm_sub_ps(lij, uij);
1100 t2 = _mm_mul_ps(diff2,
1101 _mm_sub_ps(_mm_mul_ps(onefourth, r),
1103 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1104 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1105 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1106 t4 = _mm_and_ps(t4, obc_mask3);
1107 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1108 sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1109 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
1110 _mm_mul_ps(prod, lij3));
1112 _mm_mul_ps(onefourth,
1113 _mm_add_ps(_mm_mul_ps(lij, rinv),
1114 _mm_mul_ps(lij3, r))));
1115 t2 = _mm_mul_ps(onefourth,
1116 _mm_add_ps(_mm_mul_ps(uij, rinv),
1117 _mm_mul_ps(uij3, r)));
1119 _mm_add_ps(_mm_mul_ps(half, uij2),
1120 _mm_mul_ps(prod, uij3)));
1121 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1122 _mm_mul_ps(rinv, rinv));
1124 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1126 _mm_mul_ps(sk2_rinv, rinv))));
1127 t1 = _mm_mul_ps(rinv,
1128 _mm_add_ps(_mm_mul_ps(dlij, t1),
1129 _mm_add_ps(t2, t3)));
1130 dadx1 = _mm_and_ps(t1, obc_mask1);
1134 dadx1 = _mm_setzero_ps();
1137 /* Evaluate influence of atom ai -> aj */
1138 t1 = _mm_add_ps(r, sk_ai);
1139 obc_mask1 = _mm_cmplt_ps(raj, t1);
1140 obc_mask1 = _mm_and_ps(obc_mask1, mask);
1142 if (_mm_movemask_ps(obc_mask1))
1144 t2 = _mm_sub_ps(r, sk_ai);
1145 t3 = _mm_sub_ps(sk_ai, r);
1146 obc_mask2 = _mm_cmplt_ps(raj, t2);
1147 obc_mask3 = _mm_cmplt_ps(raj, t3);
1149 uij = gmx_mm_inv_ps(t1);
1150 lij = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1151 _mm_andnot_ps(obc_mask2, raj_inv));
1152 dlij = _mm_and_ps(one, obc_mask2);
1153 uij2 = _mm_mul_ps(uij, uij);
1154 uij3 = _mm_mul_ps(uij2, uij);
1155 lij2 = _mm_mul_ps(lij, lij);
1156 lij3 = _mm_mul_ps(lij2, lij);
1157 diff2 = _mm_sub_ps(uij2, lij2);
1158 lij_inv = gmx_mm_invsqrt_ps(lij2);
1159 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
1160 prod = _mm_mul_ps(onefourth, sk2_rinv);
1161 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1162 t1 = _mm_sub_ps(lij, uij);
1163 t2 = _mm_mul_ps(diff2,
1164 _mm_sub_ps(_mm_mul_ps(onefourth, r),
1166 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1167 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1168 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
1169 t4 = _mm_and_ps(t4, obc_mask3);
1170 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1172 tmp = _mm_and_ps(t1, obc_mask1);
1174 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
1175 _mm_mul_ps(prod, lij3));
1177 _mm_mul_ps(onefourth,
1178 _mm_add_ps(_mm_mul_ps(lij, rinv),
1179 _mm_mul_ps(lij3, r))));
1180 t2 = _mm_mul_ps(onefourth,
1181 _mm_add_ps(_mm_mul_ps(uij, rinv),
1182 _mm_mul_ps(uij3, r)));
1184 _mm_add_ps(_mm_mul_ps(half, uij2),
1185 _mm_mul_ps(prod, uij3)));
1186 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1187 _mm_mul_ps(rinv, rinv));
1189 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1191 _mm_mul_ps(sk2_rinv, rinv))));
1192 t1 = _mm_mul_ps(rinv,
1193 _mm_add_ps(_mm_mul_ps(dlij, t1),
1194 _mm_add_ps(t2, t3)));
1195 dadx2 = _mm_and_ps(t1, obc_mask1);
1199 dadx2 = _mm_setzero_ps();
1200 tmp = _mm_setzero_ps();
1203 _mm_store_ps(dadx, dadx1);
1205 _mm_store_ps(dadx, dadx2);
1210 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
1212 else if (offset == 2)
1214 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
1218 /* offset must be 3 */
1219 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
1223 GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
1227 /* Parallel summations */
1228 if (DOMAINDECOMP(cr))
1230 dd_atom_sum_real(cr->dd, work);
1233 if (gb_algorithm == egbHCT)
1236 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1238 if (born->use[i] != 0)
1240 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1241 sum = 1.0/rr - work[i];
1242 min_rad = rr + doffset;
1245 born->bRad[i] = rad > min_rad ? rad : min_rad;
1246 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1250 /* Extra communication required for DD */
1251 if (DOMAINDECOMP(cr))
1253 dd_atom_spread_real(cr->dd, born->bRad);
1254 dd_atom_spread_real(cr->dd, fr->invsqrta);
1260 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1262 if (born->use[i] != 0)
1264 rr = top->atomtypes.gb_radius[md->typeA[i]];
1272 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1273 born->bRad[i] = rr_inv - tsum*rr_inv2;
1274 born->bRad[i] = 1.0 / born->bRad[i];
1276 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1278 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1279 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1282 /* Extra (local) communication required for DD */
1283 if (DOMAINDECOMP(cr))
1285 dd_atom_spread_real(cr->dd, born->bRad);
1286 dd_atom_spread_real(cr->dd, fr->invsqrta);
1287 dd_atom_spread_real(cr->dd, born->drobc);
1298 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1299 float *x, float *f, float *fshift, float *shiftvec,
1300 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1302 int i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
1303 int jnrA, jnrB, jnrC, jnrD;
1304 int j3A, j3B, j3C, j3D;
1305 int jnrE, jnrF, jnrG, jnrH;
1306 int j3E, j3F, j3G, j3H;
1309 float rbi, shX, shY, shZ;
1314 __m128 jxB, jyB, jzB;
1315 __m128 fix, fiy, fiz;
1318 __m128 dxB, dyB, dzB;
1319 __m128 txB, tyB, tzB;
1321 __m128 rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
1322 __m128 xmm1, xmm2, xmm3;
1324 const __m128 two = _mm_set1_ps(2.0f);
1330 /* Loop to get the proper form for the Born radius term, sse style */
1336 if (gb_algorithm == egbSTILL)
1338 for (i = n0; i < n1; i++)
1340 rbi = born->bRad[i];
1341 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1344 else if (gb_algorithm == egbHCT)
1346 for (i = n0; i < n1; i++)
1348 rbi = born->bRad[i];
1349 rb[i] = rbi * rbi * dvda[i];
1352 else if (gb_algorithm == egbOBC)
1354 for (i = n0; i < n1; i++)
1356 rbi = born->bRad[i];
1357 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1361 jz = _mm_setzero_ps();
1363 n = j3A = j3B = j3C = j3D = 0;
1365 for (i = 0; i < nl->nri; i++)
1369 is3 = 3*nl->shift[i];
1370 shX = shiftvec[is3];
1371 shY = shiftvec[is3+1];
1372 shZ = shiftvec[is3+2];
1373 nj0 = nl->jindex[i];
1374 nj1 = nl->jindex[i+1];
1376 ix = _mm_set1_ps(shX+x[ii3+0]);
1377 iy = _mm_set1_ps(shY+x[ii3+1]);
1378 iz = _mm_set1_ps(shZ+x[ii3+2]);
1380 offset = (nj1-nj0)%4;
1382 rbai = _mm_load1_ps(rb+ii);
1383 fix = _mm_setzero_ps();
1384 fiy = _mm_setzero_ps();
1385 fiz = _mm_setzero_ps();
1388 for (k = nj0; k < nj1-offset; k += 4)
1400 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
1402 dx = _mm_sub_ps(ix, jx);
1403 dy = _mm_sub_ps(iy, jy);
1404 dz = _mm_sub_ps(iz, jz);
1406 GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
1408 /* load chain rule terms for j1-4 */
1409 f_gb = _mm_load_ps(dadx);
1411 f_gb_ai = _mm_load_ps(dadx);
1414 /* calculate scalar force */
1415 f_gb = _mm_mul_ps(f_gb, rbai);
1416 f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1417 f_gb = _mm_add_ps(f_gb, f_gb_ai);
1419 tx = _mm_mul_ps(f_gb, dx);
1420 ty = _mm_mul_ps(f_gb, dy);
1421 tz = _mm_mul_ps(f_gb, dz);
1423 fix = _mm_add_ps(fix, tx);
1424 fiy = _mm_add_ps(fiy, ty);
1425 fiz = _mm_add_ps(fiz, tz);
1427 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
1430 /*deal with odd elements */
1437 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1438 GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
1440 else if (offset == 2)
1446 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1447 GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
1451 /* offset must be 3 */
1458 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1459 GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
1462 dx = _mm_sub_ps(ix, jx);
1463 dy = _mm_sub_ps(iy, jy);
1464 dz = _mm_sub_ps(iz, jz);
1466 /* load chain rule terms for j1-4 */
1467 f_gb = _mm_load_ps(dadx);
1469 f_gb_ai = _mm_load_ps(dadx);
1472 /* calculate scalar force */
1473 f_gb = _mm_mul_ps(f_gb, rbai);
1474 f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1475 f_gb = _mm_add_ps(f_gb, f_gb_ai);
1477 tx = _mm_mul_ps(f_gb, dx);
1478 ty = _mm_mul_ps(f_gb, dy);
1479 tz = _mm_mul_ps(f_gb, dz);
1481 fix = _mm_add_ps(fix, tx);
1482 fiy = _mm_add_ps(fiy, ty);
1483 fiz = _mm_add_ps(fiz, tz);
1487 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
1489 else if (offset == 2)
1491 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
1495 /* offset must be 3 */
1496 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
1500 /* fix/fiy/fiz now contain four partial force terms, that all should be
1501 * added to the i particle forces and shift forces.
1503 gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
1511 /* keep compiler happy */
1512 int genborn_sse_dummy;
1514 #endif /* SSE intrinsics available */