2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2013,2014, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
42 #include "gromacs/legacyheaders/typedefs.h"
43 #include "gromacs/utility/smalloc.h"
44 #include "gromacs/legacyheaders/genborn.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/fileio/pdbio.h"
47 #include "gromacs/legacyheaders/names.h"
48 #include "gromacs/math/units.h"
49 #include "gromacs/legacyheaders/domdec.h"
50 #include "gromacs/legacyheaders/network.h"
51 #include "gromacs/utility/fatalerror.h"
52 #include "gromacs/legacyheaders/genborn.h"
54 #include "gromacs/utility/gmxmpi.h"
57 /* Only compile this file if SSE intrinsics are available */
58 #if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
60 #include <gmx_sse2_single.h>
61 #include <emmintrin.h>
63 #include "genborn_sse2_single.h"
67 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
68 int natoms, gmx_localtop_t *top,
69 float *x, t_nblist *nl,
72 int i, k, n, ii, is3, ii3, nj0, nj1, offset;
73 int jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
74 int jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
95 __m128 rsq, rinv, rinv2, rinv4, rinv6;
96 __m128 rsqB, rinvB, rinv2B, rinv4B, rinv6B;
97 __m128 ratio, gpi, rai, raj, vai, vaj, rvdw;
98 __m128 ratioB, rajB, vajB, rvdwB;
99 __m128 ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
100 __m128 ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
101 __m128 mask, icf4, icf6, mask_cmp;
102 __m128 icf4B, icf6B, mask_cmpB;
104 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
105 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
106 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
108 const __m128 half = _mm_set1_ps(0.5f);
109 const __m128 three = _mm_set1_ps(3.0f);
110 const __m128 one = _mm_set1_ps(1.0f);
111 const __m128 two = _mm_set1_ps(2.0f);
112 const __m128 zero = _mm_set1_ps(0.0f);
113 const __m128 four = _mm_set1_ps(4.0f);
115 const __m128 still_p5inv = _mm_set1_ps(STILL_P5INV);
116 const __m128 still_pip5 = _mm_set1_ps(STILL_PIP5);
117 const __m128 still_p4 = _mm_set1_ps(STILL_P4);
119 factor = 0.5 * ONE_4PI_EPS0;
121 gb_radius = born->gb_radius;
123 work = born->gpol_still_work;
125 shiftvec = fr->shift_vec[0];
128 jnrA = jnrB = jnrC = jnrD = 0;
129 jx = _mm_setzero_ps();
130 jy = _mm_setzero_ps();
131 jz = _mm_setzero_ps();
135 for (i = 0; i < natoms; i++)
140 for (i = 0; i < nl->nri; i++)
144 is3 = 3*nl->shift[i];
146 shY = shiftvec[is3+1];
147 shZ = shiftvec[is3+2];
149 nj1 = nl->jindex[i+1];
151 ix = _mm_set1_ps(shX+x[ii3+0]);
152 iy = _mm_set1_ps(shY+x[ii3+1]);
153 iz = _mm_set1_ps(shZ+x[ii3+2]);
155 offset = (nj1-nj0)%4;
157 /* Polarization energy for atom ai */
158 gpi = _mm_setzero_ps();
160 rai = _mm_load1_ps(gb_radius+ii);
161 prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
163 for (k = nj0; k < nj1-4-offset; k += 8)
183 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
184 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
186 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
187 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
188 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
189 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);
191 dx = _mm_sub_ps(ix, jx);
192 dy = _mm_sub_ps(iy, jy);
193 dz = _mm_sub_ps(iz, jz);
194 dxB = _mm_sub_ps(ix, jxB);
195 dyB = _mm_sub_ps(iy, jyB);
196 dzB = _mm_sub_ps(iz, jzB);
198 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
199 rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
200 rinv = gmx_mm_invsqrt_ps(rsq);
201 rinvB = gmx_mm_invsqrt_ps(rsqB);
202 rinv2 = _mm_mul_ps(rinv, rinv);
203 rinv2B = _mm_mul_ps(rinvB, rinvB);
204 rinv4 = _mm_mul_ps(rinv2, rinv2);
205 rinv4B = _mm_mul_ps(rinv2B, rinv2B);
206 rinv6 = _mm_mul_ps(rinv4, rinv2);
207 rinv6B = _mm_mul_ps(rinv4B, rinv2B);
209 rvdw = _mm_add_ps(rai, raj);
210 rvdwB = _mm_add_ps(rai, rajB);
211 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
212 ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));
214 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
215 mask_cmpB = _mm_cmple_ps(ratioB, still_p5inv);
217 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
218 if (0 == _mm_movemask_ps(mask_cmp) )
220 /* if ratio>still_p5inv for ALL elements */
222 dccf = _mm_setzero_ps();
226 ratio = _mm_min_ps(ratio, still_p5inv);
227 theta = _mm_mul_ps(ratio, still_pip5);
228 gmx_mm_sincos_ps(theta, &sinq, &cosq);
229 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
230 ccf = _mm_mul_ps(term, term);
231 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
232 _mm_mul_ps(sinq, theta));
234 if (0 == _mm_movemask_ps(mask_cmpB) )
236 /* if ratio>still_p5inv for ALL elements */
238 dccfB = _mm_setzero_ps();
242 ratioB = _mm_min_ps(ratioB, still_p5inv);
243 thetaB = _mm_mul_ps(ratioB, still_pip5);
244 gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
245 termB = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
246 ccfB = _mm_mul_ps(termB, termB);
247 dccfB = _mm_mul_ps(_mm_mul_ps(two, termB),
248 _mm_mul_ps(sinqB, thetaB));
251 prod = _mm_mul_ps(still_p4, vaj);
252 prodB = _mm_mul_ps(still_p4, vajB);
253 icf4 = _mm_mul_ps(ccf, rinv4);
254 icf4B = _mm_mul_ps(ccfB, rinv4B);
255 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
256 icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);
258 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
259 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));
261 gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );
263 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
265 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
267 _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
269 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
273 for (; k < nj1-offset; k += 4)
285 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
287 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
288 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
290 dx = _mm_sub_ps(ix, jx);
291 dy = _mm_sub_ps(iy, jy);
292 dz = _mm_sub_ps(iz, jz);
294 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
295 rinv = gmx_mm_invsqrt_ps(rsq);
296 rinv2 = _mm_mul_ps(rinv, rinv);
297 rinv4 = _mm_mul_ps(rinv2, rinv2);
298 rinv6 = _mm_mul_ps(rinv4, rinv2);
300 rvdw = _mm_add_ps(rai, raj);
301 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
303 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
305 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
306 if (0 == _mm_movemask_ps(mask_cmp))
308 /* if ratio>still_p5inv for ALL elements */
310 dccf = _mm_setzero_ps();
314 ratio = _mm_min_ps(ratio, still_p5inv);
315 theta = _mm_mul_ps(ratio, still_pip5);
316 gmx_mm_sincos_ps(theta, &sinq, &cosq);
317 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
318 ccf = _mm_mul_ps(term, term);
319 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
320 _mm_mul_ps(sinq, theta));
323 prod = _mm_mul_ps(still_p4, vaj);
324 icf4 = _mm_mul_ps(ccf, rinv4);
325 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
327 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
329 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
331 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
333 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
343 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
344 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
345 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
348 else if (offset == 2)
354 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
355 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
356 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
361 /* offset must be 3 */
368 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
369 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
370 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
374 dx = _mm_sub_ps(ix, jx);
375 dy = _mm_sub_ps(iy, jy);
376 dz = _mm_sub_ps(iz, jz);
378 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
379 rinv = gmx_mm_invsqrt_ps(rsq);
380 rinv2 = _mm_mul_ps(rinv, rinv);
381 rinv4 = _mm_mul_ps(rinv2, rinv2);
382 rinv6 = _mm_mul_ps(rinv4, rinv2);
384 rvdw = _mm_add_ps(rai, raj);
385 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
387 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
389 if (0 == _mm_movemask_ps(mask_cmp))
391 /* if ratio>still_p5inv for ALL elements */
393 dccf = _mm_setzero_ps();
397 ratio = _mm_min_ps(ratio, still_p5inv);
398 theta = _mm_mul_ps(ratio, still_pip5);
399 gmx_mm_sincos_ps(theta, &sinq, &cosq);
400 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
401 ccf = _mm_mul_ps(term, term);
402 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
403 _mm_mul_ps(sinq, theta));
406 prod = _mm_mul_ps(still_p4, vaj);
407 icf4 = _mm_mul_ps(ccf, rinv4);
408 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
410 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
412 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
414 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
417 tmp = _mm_mul_ps(prod_ai, icf4);
421 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
423 else if (offset == 2)
425 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
429 /* offset must be 3 */
430 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
433 GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
436 /* Sum up the polarization energy from other nodes */
437 if (DOMAINDECOMP(cr))
439 dd_atom_sum_real(cr->dd, work);
442 /* Compute the radii */
443 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
445 if (born->use[i] != 0)
447 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
448 gpi2 = gpi_ai * gpi_ai;
449 born->bRad[i] = factor*gmx_invsqrt(gpi2);
450 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
454 /* Extra (local) communication required for DD */
455 if (DOMAINDECOMP(cr))
457 dd_atom_spread_real(cr->dd, born->bRad);
458 dd_atom_spread_real(cr->dd, fr->invsqrta);
466 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
467 float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
469 int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
470 int jnrA, jnrB, jnrC, jnrD;
471 int j3A, j3B, j3C, j3D;
472 int jnrE, jnrF, jnrG, jnrH;
473 int j3E, j3F, j3G, j3H;
475 float rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
476 float sum_ai2, sum_ai3, tsum, tchain, doffset;
485 __m128 ix, iy, iz, jx, jy, jz;
486 __m128 dx, dy, dz, t1, t2, t3, t4;
488 __m128 rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
489 __m128 uij, lij2, uij2, lij3, uij3, diff2;
490 __m128 lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
491 __m128 sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
495 __m128 obc_mask1, obc_mask2, obc_mask3;
496 __m128 jxB, jyB, jzB, t1B, t2B, t3B, t4B;
497 __m128 dxB, dyB, dzB, rsqB, rinvB, rB;
498 __m128 rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
499 __m128 uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
500 __m128 lij_invB, sk2_invB, prodB;
501 __m128 sk_ajB, sk2_ajB, sk2_rinvB;
502 __m128 dadx1B, dadx2B;
504 __m128 obc_mask1B, obc_mask2B, obc_mask3B;
506 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
507 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
508 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
510 __m128 oneeighth = _mm_set1_ps(0.125);
511 __m128 onefourth = _mm_set1_ps(0.25);
513 const __m128 half = _mm_set1_ps(0.5f);
514 const __m128 three = _mm_set1_ps(3.0f);
515 const __m128 one = _mm_set1_ps(1.0f);
516 const __m128 two = _mm_set1_ps(2.0f);
517 const __m128 zero = _mm_set1_ps(0.0f);
518 const __m128 neg = _mm_set1_ps(-1.0f);
520 /* Set the dielectric offset */
521 doffset = born->gb_doffset;
522 gb_radius = born->gb_radius;
523 obc_param = born->param;
524 work = born->gpol_hct_work;
527 shiftvec = fr->shift_vec[0];
529 jx = _mm_setzero_ps();
530 jy = _mm_setzero_ps();
531 jz = _mm_setzero_ps();
533 jnrA = jnrB = jnrC = jnrD = 0;
535 for (i = 0; i < born->nr; i++)
540 for (i = 0; i < nl->nri; i++)
544 is3 = 3*nl->shift[i];
546 shY = shiftvec[is3+1];
547 shZ = shiftvec[is3+2];
549 nj1 = nl->jindex[i+1];
551 ix = _mm_set1_ps(shX+x[ii3+0]);
552 iy = _mm_set1_ps(shY+x[ii3+1]);
553 iz = _mm_set1_ps(shZ+x[ii3+2]);
555 offset = (nj1-nj0)%4;
557 rai = _mm_load1_ps(gb_radius+ii);
558 rai_inv = gmx_mm_inv_ps(rai);
560 sum_ai = _mm_setzero_ps();
562 sk_ai = _mm_load1_ps(born->param+ii);
563 sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
565 for (k = nj0; k < nj1-4-offset; k += 8)
585 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
586 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
587 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
588 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
589 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
590 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
592 dx = _mm_sub_ps(ix, jx);
593 dy = _mm_sub_ps(iy, jy);
594 dz = _mm_sub_ps(iz, jz);
595 dxB = _mm_sub_ps(ix, jxB);
596 dyB = _mm_sub_ps(iy, jyB);
597 dzB = _mm_sub_ps(iz, jzB);
599 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
600 rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
602 rinv = gmx_mm_invsqrt_ps(rsq);
603 r = _mm_mul_ps(rsq, rinv);
604 rinvB = gmx_mm_invsqrt_ps(rsqB);
605 rB = _mm_mul_ps(rsqB, rinvB);
607 /* Compute raj_inv aj1-4 */
608 raj_inv = gmx_mm_inv_ps(raj);
609 raj_invB = gmx_mm_inv_ps(rajB);
611 /* Evaluate influence of atom aj -> ai */
612 t1 = _mm_add_ps(r, sk_aj);
613 t2 = _mm_sub_ps(r, sk_aj);
614 t3 = _mm_sub_ps(sk_aj, r);
615 t1B = _mm_add_ps(rB, sk_ajB);
616 t2B = _mm_sub_ps(rB, sk_ajB);
617 t3B = _mm_sub_ps(sk_ajB, rB);
618 obc_mask1 = _mm_cmplt_ps(rai, t1);
619 obc_mask2 = _mm_cmplt_ps(rai, t2);
620 obc_mask3 = _mm_cmplt_ps(rai, t3);
621 obc_mask1B = _mm_cmplt_ps(rai, t1B);
622 obc_mask2B = _mm_cmplt_ps(rai, t2B);
623 obc_mask3B = _mm_cmplt_ps(rai, t3B);
625 uij = gmx_mm_inv_ps(t1);
626 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
627 _mm_andnot_ps(obc_mask2, rai_inv));
628 dlij = _mm_and_ps(one, obc_mask2);
629 uij2 = _mm_mul_ps(uij, uij);
630 uij3 = _mm_mul_ps(uij2, uij);
631 lij2 = _mm_mul_ps(lij, lij);
632 lij3 = _mm_mul_ps(lij2, lij);
634 uijB = gmx_mm_inv_ps(t1B);
635 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
636 _mm_andnot_ps(obc_mask2B, rai_inv));
637 dlijB = _mm_and_ps(one, obc_mask2B);
638 uij2B = _mm_mul_ps(uijB, uijB);
639 uij3B = _mm_mul_ps(uij2B, uijB);
640 lij2B = _mm_mul_ps(lijB, lijB);
641 lij3B = _mm_mul_ps(lij2B, lijB);
643 diff2 = _mm_sub_ps(uij2, lij2);
644 lij_inv = gmx_mm_invsqrt_ps(lij2);
645 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
646 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
647 prod = _mm_mul_ps(onefourth, sk2_rinv);
649 diff2B = _mm_sub_ps(uij2B, lij2B);
650 lij_invB = gmx_mm_invsqrt_ps(lij2B);
651 sk2_ajB = _mm_mul_ps(sk_ajB, sk_ajB);
652 sk2_rinvB = _mm_mul_ps(sk2_ajB, rinvB);
653 prodB = _mm_mul_ps(onefourth, sk2_rinvB);
655 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
656 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
658 t1 = _mm_sub_ps(lij, uij);
659 t2 = _mm_mul_ps(diff2,
660 _mm_sub_ps(_mm_mul_ps(onefourth, r),
662 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
663 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
664 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
665 t4 = _mm_and_ps(t4, obc_mask3);
666 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
668 t1B = _mm_sub_ps(lijB, uijB);
669 t2B = _mm_mul_ps(diff2B,
670 _mm_sub_ps(_mm_mul_ps(onefourth, rB),
672 t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
673 t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
674 t4B = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
675 t4B = _mm_and_ps(t4B, obc_mask3B);
676 t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
678 sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
680 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
681 _mm_mul_ps(prod, lij3));
683 _mm_mul_ps(onefourth,
684 _mm_add_ps(_mm_mul_ps(lij, rinv),
685 _mm_mul_ps(lij3, r))));
686 t2 = _mm_mul_ps(onefourth,
687 _mm_add_ps(_mm_mul_ps(uij, rinv),
688 _mm_mul_ps(uij3, r)));
690 _mm_add_ps(_mm_mul_ps(half, uij2),
691 _mm_mul_ps(prod, uij3)));
692 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
693 _mm_mul_ps(rinv, rinv));
695 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
697 _mm_mul_ps(sk2_rinv, rinv))));
698 t1 = _mm_mul_ps(rinv,
699 _mm_add_ps(_mm_mul_ps(dlij, t1),
700 _mm_add_ps(t2, t3)));
704 t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
705 _mm_mul_ps(prodB, lij3B));
706 t1B = _mm_sub_ps(t1B,
707 _mm_mul_ps(onefourth,
708 _mm_add_ps(_mm_mul_ps(lijB, rinvB),
709 _mm_mul_ps(lij3B, rB))));
710 t2B = _mm_mul_ps(onefourth,
711 _mm_add_ps(_mm_mul_ps(uijB, rinvB),
712 _mm_mul_ps(uij3B, rB)));
713 t2B = _mm_sub_ps(t2B,
714 _mm_add_ps(_mm_mul_ps(half, uij2B),
715 _mm_mul_ps(prodB, uij3B)));
716 t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
717 _mm_mul_ps(rinvB, rinvB));
718 t3B = _mm_sub_ps(t3B,
719 _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
721 _mm_mul_ps(sk2_rinvB, rinvB))));
722 t1B = _mm_mul_ps(rinvB,
723 _mm_add_ps(_mm_mul_ps(dlijB, t1B),
724 _mm_add_ps(t2B, t3B)));
726 dadx1 = _mm_and_ps(t1, obc_mask1);
727 dadx1B = _mm_and_ps(t1B, obc_mask1B);
730 /* Evaluate influence of atom ai -> aj */
731 t1 = _mm_add_ps(r, sk_ai);
732 t2 = _mm_sub_ps(r, sk_ai);
733 t3 = _mm_sub_ps(sk_ai, r);
734 t1B = _mm_add_ps(rB, sk_ai);
735 t2B = _mm_sub_ps(rB, sk_ai);
736 t3B = _mm_sub_ps(sk_ai, rB);
737 obc_mask1 = _mm_cmplt_ps(raj, t1);
738 obc_mask2 = _mm_cmplt_ps(raj, t2);
739 obc_mask3 = _mm_cmplt_ps(raj, t3);
740 obc_mask1B = _mm_cmplt_ps(rajB, t1B);
741 obc_mask2B = _mm_cmplt_ps(rajB, t2B);
742 obc_mask3B = _mm_cmplt_ps(rajB, t3B);
744 uij = gmx_mm_inv_ps(t1);
745 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
746 _mm_andnot_ps(obc_mask2, raj_inv));
747 dlij = _mm_and_ps(one, obc_mask2);
748 uij2 = _mm_mul_ps(uij, uij);
749 uij3 = _mm_mul_ps(uij2, uij);
750 lij2 = _mm_mul_ps(lij, lij);
751 lij3 = _mm_mul_ps(lij2, lij);
753 uijB = gmx_mm_inv_ps(t1B);
754 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
755 _mm_andnot_ps(obc_mask2B, raj_invB));
756 dlijB = _mm_and_ps(one, obc_mask2B);
757 uij2B = _mm_mul_ps(uijB, uijB);
758 uij3B = _mm_mul_ps(uij2B, uijB);
759 lij2B = _mm_mul_ps(lijB, lijB);
760 lij3B = _mm_mul_ps(lij2B, lijB);
762 diff2 = _mm_sub_ps(uij2, lij2);
763 lij_inv = gmx_mm_invsqrt_ps(lij2);
764 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
765 prod = _mm_mul_ps(onefourth, sk2_rinv);
767 diff2B = _mm_sub_ps(uij2B, lij2B);
768 lij_invB = gmx_mm_invsqrt_ps(lij2B);
769 sk2_rinvB = _mm_mul_ps(sk2_ai, rinvB);
770 prodB = _mm_mul_ps(onefourth, sk2_rinvB);
772 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
773 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
775 t1 = _mm_sub_ps(lij, uij);
776 t2 = _mm_mul_ps(diff2,
777 _mm_sub_ps(_mm_mul_ps(onefourth, r),
779 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
780 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
781 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
782 t4 = _mm_and_ps(t4, obc_mask3);
783 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
785 t1B = _mm_sub_ps(lijB, uijB);
786 t2B = _mm_mul_ps(diff2B,
787 _mm_sub_ps(_mm_mul_ps(onefourth, rB),
789 t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
790 t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
791 t4B = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
792 t4B = _mm_and_ps(t4B, obc_mask3B);
793 t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
795 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
796 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
798 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
799 _mm_mul_ps(prod, lij3));
801 _mm_mul_ps(onefourth,
802 _mm_add_ps(_mm_mul_ps(lij, rinv),
803 _mm_mul_ps(lij3, r))));
804 t2 = _mm_mul_ps(onefourth,
805 _mm_add_ps(_mm_mul_ps(uij, rinv),
806 _mm_mul_ps(uij3, r)));
808 _mm_add_ps(_mm_mul_ps(half, uij2),
809 _mm_mul_ps(prod, uij3)));
810 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
811 _mm_mul_ps(rinv, rinv));
813 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
815 _mm_mul_ps(sk2_rinv, rinv))));
816 t1 = _mm_mul_ps(rinv,
817 _mm_add_ps(_mm_mul_ps(dlij, t1),
818 _mm_add_ps(t2, t3)));
821 t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
822 _mm_mul_ps(prodB, lij3B));
823 t1B = _mm_sub_ps(t1B,
824 _mm_mul_ps(onefourth,
825 _mm_add_ps(_mm_mul_ps(lijB, rinvB),
826 _mm_mul_ps(lij3B, rB))));
827 t2B = _mm_mul_ps(onefourth,
828 _mm_add_ps(_mm_mul_ps(uijB, rinvB),
829 _mm_mul_ps(uij3B, rB)));
830 t2B = _mm_sub_ps(t2B,
831 _mm_add_ps(_mm_mul_ps(half, uij2B),
832 _mm_mul_ps(prodB, uij3B)));
833 t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
834 _mm_mul_ps(rinvB, rinvB));
835 t3B = _mm_sub_ps(t3B,
836 _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
838 _mm_mul_ps(sk2_rinvB, rinvB))));
839 t1B = _mm_mul_ps(rinvB,
840 _mm_add_ps(_mm_mul_ps(dlijB, t1B),
841 _mm_add_ps(t2B, t3B)));
844 dadx2 = _mm_and_ps(t1, obc_mask1);
845 dadx2B = _mm_and_ps(t1B, obc_mask1B);
847 _mm_store_ps(dadx, dadx1);
849 _mm_store_ps(dadx, dadx2);
851 _mm_store_ps(dadx, dadx1B);
853 _mm_store_ps(dadx, dadx2B);
856 } /* end normal inner loop */
858 for (; k < nj1-offset; k += 4)
870 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
871 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
872 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
874 dx = _mm_sub_ps(ix, jx);
875 dy = _mm_sub_ps(iy, jy);
876 dz = _mm_sub_ps(iz, jz);
878 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
880 rinv = gmx_mm_invsqrt_ps(rsq);
881 r = _mm_mul_ps(rsq, rinv);
883 /* Compute raj_inv aj1-4 */
884 raj_inv = gmx_mm_inv_ps(raj);
886 /* Evaluate influence of atom aj -> ai */
887 t1 = _mm_add_ps(r, sk_aj);
888 obc_mask1 = _mm_cmplt_ps(rai, t1);
890 if (_mm_movemask_ps(obc_mask1))
892 /* If any of the elements has rai<dr+sk, this is executed */
893 t2 = _mm_sub_ps(r, sk_aj);
894 t3 = _mm_sub_ps(sk_aj, r);
896 obc_mask2 = _mm_cmplt_ps(rai, t2);
897 obc_mask3 = _mm_cmplt_ps(rai, t3);
899 uij = gmx_mm_inv_ps(t1);
900 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
901 _mm_andnot_ps(obc_mask2, rai_inv));
902 dlij = _mm_and_ps(one, obc_mask2);
903 uij2 = _mm_mul_ps(uij, uij);
904 uij3 = _mm_mul_ps(uij2, uij);
905 lij2 = _mm_mul_ps(lij, lij);
906 lij3 = _mm_mul_ps(lij2, lij);
907 diff2 = _mm_sub_ps(uij2, lij2);
908 lij_inv = gmx_mm_invsqrt_ps(lij2);
909 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
910 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
911 prod = _mm_mul_ps(onefourth, sk2_rinv);
912 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
913 t1 = _mm_sub_ps(lij, uij);
914 t2 = _mm_mul_ps(diff2,
915 _mm_sub_ps(_mm_mul_ps(onefourth, r),
917 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
918 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
919 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
920 t4 = _mm_and_ps(t4, obc_mask3);
921 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
922 sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
923 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
924 _mm_mul_ps(prod, lij3));
926 _mm_mul_ps(onefourth,
927 _mm_add_ps(_mm_mul_ps(lij, rinv),
928 _mm_mul_ps(lij3, r))));
929 t2 = _mm_mul_ps(onefourth,
930 _mm_add_ps(_mm_mul_ps(uij, rinv),
931 _mm_mul_ps(uij3, r)));
933 _mm_add_ps(_mm_mul_ps(half, uij2),
934 _mm_mul_ps(prod, uij3)));
935 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
936 _mm_mul_ps(rinv, rinv));
938 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
940 _mm_mul_ps(sk2_rinv, rinv))));
941 t1 = _mm_mul_ps(rinv,
942 _mm_add_ps(_mm_mul_ps(dlij, t1),
943 _mm_add_ps(t2, t3)));
945 dadx1 = _mm_and_ps(t1, obc_mask1);
949 dadx1 = _mm_setzero_ps();
952 /* Evaluate influence of atom ai -> aj */
953 t1 = _mm_add_ps(r, sk_ai);
954 obc_mask1 = _mm_cmplt_ps(raj, t1);
956 if (_mm_movemask_ps(obc_mask1))
958 t2 = _mm_sub_ps(r, sk_ai);
959 t3 = _mm_sub_ps(sk_ai, r);
960 obc_mask2 = _mm_cmplt_ps(raj, t2);
961 obc_mask3 = _mm_cmplt_ps(raj, t3);
963 uij = gmx_mm_inv_ps(t1);
964 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
965 _mm_andnot_ps(obc_mask2, raj_inv));
966 dlij = _mm_and_ps(one, obc_mask2);
967 uij2 = _mm_mul_ps(uij, uij);
968 uij3 = _mm_mul_ps(uij2, uij);
969 lij2 = _mm_mul_ps(lij, lij);
970 lij3 = _mm_mul_ps(lij2, lij);
971 diff2 = _mm_sub_ps(uij2, lij2);
972 lij_inv = gmx_mm_invsqrt_ps(lij2);
973 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
974 prod = _mm_mul_ps(onefourth, sk2_rinv);
975 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
976 t1 = _mm_sub_ps(lij, uij);
977 t2 = _mm_mul_ps(diff2,
978 _mm_sub_ps(_mm_mul_ps(onefourth, r),
980 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
981 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
982 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
983 t4 = _mm_and_ps(t4, obc_mask3);
984 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
986 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
988 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
989 _mm_mul_ps(prod, lij3));
991 _mm_mul_ps(onefourth,
992 _mm_add_ps(_mm_mul_ps(lij, rinv),
993 _mm_mul_ps(lij3, r))));
994 t2 = _mm_mul_ps(onefourth,
995 _mm_add_ps(_mm_mul_ps(uij, rinv),
996 _mm_mul_ps(uij3, r)));
998 _mm_add_ps(_mm_mul_ps(half, uij2),
999 _mm_mul_ps(prod, uij3)));
1000 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1001 _mm_mul_ps(rinv, rinv));
1003 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1005 _mm_mul_ps(sk2_rinv, rinv))));
1006 t1 = _mm_mul_ps(rinv,
1007 _mm_add_ps(_mm_mul_ps(dlij, t1),
1008 _mm_add_ps(t2, t3)));
1009 dadx2 = _mm_and_ps(t1, obc_mask1);
1013 dadx2 = _mm_setzero_ps();
1016 _mm_store_ps(dadx, dadx1);
1018 _mm_store_ps(dadx, dadx2);
1020 } /* end normal inner loop */
1028 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1029 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1030 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1033 else if (offset == 2)
1039 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1040 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1041 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1046 /* offset must be 3 */
1053 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1054 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1055 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1059 dx = _mm_sub_ps(ix, jx);
1060 dy = _mm_sub_ps(iy, jy);
1061 dz = _mm_sub_ps(iz, jz);
1063 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
1065 rinv = gmx_mm_invsqrt_ps(rsq);
1066 r = _mm_mul_ps(rsq, rinv);
1068 /* Compute raj_inv aj1-4 */
1069 raj_inv = gmx_mm_inv_ps(raj);
1071 /* Evaluate influence of atom aj -> ai */
1072 t1 = _mm_add_ps(r, sk_aj);
1073 obc_mask1 = _mm_cmplt_ps(rai, t1);
1074 obc_mask1 = _mm_and_ps(obc_mask1, mask);
1076 if (_mm_movemask_ps(obc_mask1))
1078 t2 = _mm_sub_ps(r, sk_aj);
1079 t3 = _mm_sub_ps(sk_aj, r);
1080 obc_mask2 = _mm_cmplt_ps(rai, t2);
1081 obc_mask3 = _mm_cmplt_ps(rai, t3);
1083 uij = gmx_mm_inv_ps(t1);
1084 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1085 _mm_andnot_ps(obc_mask2, rai_inv));
1086 dlij = _mm_and_ps(one, obc_mask2);
1087 uij2 = _mm_mul_ps(uij, uij);
1088 uij3 = _mm_mul_ps(uij2, uij);
1089 lij2 = _mm_mul_ps(lij, lij);
1090 lij3 = _mm_mul_ps(lij2, lij);
1091 diff2 = _mm_sub_ps(uij2, lij2);
1092 lij_inv = gmx_mm_invsqrt_ps(lij2);
1093 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
1094 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
1095 prod = _mm_mul_ps(onefourth, sk2_rinv);
1096 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1097 t1 = _mm_sub_ps(lij, uij);
1098 t2 = _mm_mul_ps(diff2,
1099 _mm_sub_ps(_mm_mul_ps(onefourth, r),
1101 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1102 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1103 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1104 t4 = _mm_and_ps(t4, obc_mask3);
1105 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1106 sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1107 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
1108 _mm_mul_ps(prod, lij3));
1110 _mm_mul_ps(onefourth,
1111 _mm_add_ps(_mm_mul_ps(lij, rinv),
1112 _mm_mul_ps(lij3, r))));
1113 t2 = _mm_mul_ps(onefourth,
1114 _mm_add_ps(_mm_mul_ps(uij, rinv),
1115 _mm_mul_ps(uij3, r)));
1117 _mm_add_ps(_mm_mul_ps(half, uij2),
1118 _mm_mul_ps(prod, uij3)));
1119 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1120 _mm_mul_ps(rinv, rinv));
1122 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1124 _mm_mul_ps(sk2_rinv, rinv))));
1125 t1 = _mm_mul_ps(rinv,
1126 _mm_add_ps(_mm_mul_ps(dlij, t1),
1127 _mm_add_ps(t2, t3)));
1128 dadx1 = _mm_and_ps(t1, obc_mask1);
1132 dadx1 = _mm_setzero_ps();
1135 /* Evaluate influence of atom ai -> aj */
1136 t1 = _mm_add_ps(r, sk_ai);
1137 obc_mask1 = _mm_cmplt_ps(raj, t1);
1138 obc_mask1 = _mm_and_ps(obc_mask1, mask);
1140 if (_mm_movemask_ps(obc_mask1))
1142 t2 = _mm_sub_ps(r, sk_ai);
1143 t3 = _mm_sub_ps(sk_ai, r);
1144 obc_mask2 = _mm_cmplt_ps(raj, t2);
1145 obc_mask3 = _mm_cmplt_ps(raj, t3);
1147 uij = gmx_mm_inv_ps(t1);
1148 lij = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1149 _mm_andnot_ps(obc_mask2, raj_inv));
1150 dlij = _mm_and_ps(one, obc_mask2);
1151 uij2 = _mm_mul_ps(uij, uij);
1152 uij3 = _mm_mul_ps(uij2, uij);
1153 lij2 = _mm_mul_ps(lij, lij);
1154 lij3 = _mm_mul_ps(lij2, lij);
1155 diff2 = _mm_sub_ps(uij2, lij2);
1156 lij_inv = gmx_mm_invsqrt_ps(lij2);
1157 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
1158 prod = _mm_mul_ps(onefourth, sk2_rinv);
1159 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1160 t1 = _mm_sub_ps(lij, uij);
1161 t2 = _mm_mul_ps(diff2,
1162 _mm_sub_ps(_mm_mul_ps(onefourth, r),
1164 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1165 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1166 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
1167 t4 = _mm_and_ps(t4, obc_mask3);
1168 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1170 tmp = _mm_and_ps(t1, obc_mask1);
1172 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
1173 _mm_mul_ps(prod, lij3));
1175 _mm_mul_ps(onefourth,
1176 _mm_add_ps(_mm_mul_ps(lij, rinv),
1177 _mm_mul_ps(lij3, r))));
1178 t2 = _mm_mul_ps(onefourth,
1179 _mm_add_ps(_mm_mul_ps(uij, rinv),
1180 _mm_mul_ps(uij3, r)));
1182 _mm_add_ps(_mm_mul_ps(half, uij2),
1183 _mm_mul_ps(prod, uij3)));
1184 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1185 _mm_mul_ps(rinv, rinv));
1187 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1189 _mm_mul_ps(sk2_rinv, rinv))));
1190 t1 = _mm_mul_ps(rinv,
1191 _mm_add_ps(_mm_mul_ps(dlij, t1),
1192 _mm_add_ps(t2, t3)));
1193 dadx2 = _mm_and_ps(t1, obc_mask1);
1197 dadx2 = _mm_setzero_ps();
1198 tmp = _mm_setzero_ps();
1201 _mm_store_ps(dadx, dadx1);
1203 _mm_store_ps(dadx, dadx2);
1208 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
1210 else if (offset == 2)
1212 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
1216 /* offset must be 3 */
1217 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
1221 GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
1225 /* Parallel summations */
1226 if (DOMAINDECOMP(cr))
1228 dd_atom_sum_real(cr->dd, work);
1231 if (gb_algorithm == egbHCT)
1234 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1236 if (born->use[i] != 0)
1238 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1239 sum = 1.0/rr - work[i];
1240 min_rad = rr + doffset;
1243 born->bRad[i] = rad > min_rad ? rad : min_rad;
1244 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1248 /* Extra communication required for DD */
1249 if (DOMAINDECOMP(cr))
1251 dd_atom_spread_real(cr->dd, born->bRad);
1252 dd_atom_spread_real(cr->dd, fr->invsqrta);
1258 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1260 if (born->use[i] != 0)
1262 rr = top->atomtypes.gb_radius[md->typeA[i]];
1270 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1271 born->bRad[i] = rr_inv - tsum*rr_inv2;
1272 born->bRad[i] = 1.0 / born->bRad[i];
1274 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1276 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1277 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1280 /* Extra (local) communication required for DD */
1281 if (DOMAINDECOMP(cr))
1283 dd_atom_spread_real(cr->dd, born->bRad);
1284 dd_atom_spread_real(cr->dd, fr->invsqrta);
1285 dd_atom_spread_real(cr->dd, born->drobc);
1296 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1297 float *x, float *f, float *fshift, float *shiftvec,
1298 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1300 int i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
1301 int jnrA, jnrB, jnrC, jnrD;
1302 int j3A, j3B, j3C, j3D;
1303 int jnrE, jnrF, jnrG, jnrH;
1304 int j3E, j3F, j3G, j3H;
1307 float rbi, shX, shY, shZ;
1312 __m128 jxB, jyB, jzB;
1313 __m128 fix, fiy, fiz;
1316 __m128 dxB, dyB, dzB;
1317 __m128 txB, tyB, tzB;
1319 __m128 rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
1320 __m128 xmm1, xmm2, xmm3;
1322 const __m128 two = _mm_set1_ps(2.0f);
1328 /* Loop to get the proper form for the Born radius term, sse style */
1334 if (gb_algorithm == egbSTILL)
1336 for (i = n0; i < n1; i++)
1338 rbi = born->bRad[i];
1339 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1342 else if (gb_algorithm == egbHCT)
1344 for (i = n0; i < n1; i++)
1346 rbi = born->bRad[i];
1347 rb[i] = rbi * rbi * dvda[i];
1350 else if (gb_algorithm == egbOBC)
1352 for (i = n0; i < n1; i++)
1354 rbi = born->bRad[i];
1355 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1359 jz = _mm_setzero_ps();
1361 n = j3A = j3B = j3C = j3D = 0;
1363 for (i = 0; i < nl->nri; i++)
1367 is3 = 3*nl->shift[i];
1368 shX = shiftvec[is3];
1369 shY = shiftvec[is3+1];
1370 shZ = shiftvec[is3+2];
1371 nj0 = nl->jindex[i];
1372 nj1 = nl->jindex[i+1];
1374 ix = _mm_set1_ps(shX+x[ii3+0]);
1375 iy = _mm_set1_ps(shY+x[ii3+1]);
1376 iz = _mm_set1_ps(shZ+x[ii3+2]);
1378 offset = (nj1-nj0)%4;
1380 rbai = _mm_load1_ps(rb+ii);
1381 fix = _mm_setzero_ps();
1382 fiy = _mm_setzero_ps();
1383 fiz = _mm_setzero_ps();
1386 for (k = nj0; k < nj1-offset; k += 4)
1398 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
1400 dx = _mm_sub_ps(ix, jx);
1401 dy = _mm_sub_ps(iy, jy);
1402 dz = _mm_sub_ps(iz, jz);
1404 GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
1406 /* load chain rule terms for j1-4 */
1407 f_gb = _mm_load_ps(dadx);
1409 f_gb_ai = _mm_load_ps(dadx);
1412 /* calculate scalar force */
1413 f_gb = _mm_mul_ps(f_gb, rbai);
1414 f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1415 f_gb = _mm_add_ps(f_gb, f_gb_ai);
1417 tx = _mm_mul_ps(f_gb, dx);
1418 ty = _mm_mul_ps(f_gb, dy);
1419 tz = _mm_mul_ps(f_gb, dz);
1421 fix = _mm_add_ps(fix, tx);
1422 fiy = _mm_add_ps(fiy, ty);
1423 fiz = _mm_add_ps(fiz, tz);
1425 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
1428 /*deal with odd elements */
1435 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1436 GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
1438 else if (offset == 2)
1444 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1445 GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
1449 /* offset must be 3 */
1456 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1457 GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
1460 dx = _mm_sub_ps(ix, jx);
1461 dy = _mm_sub_ps(iy, jy);
1462 dz = _mm_sub_ps(iz, jz);
1464 /* load chain rule terms for j1-4 */
1465 f_gb = _mm_load_ps(dadx);
1467 f_gb_ai = _mm_load_ps(dadx);
1470 /* calculate scalar force */
1471 f_gb = _mm_mul_ps(f_gb, rbai);
1472 f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1473 f_gb = _mm_add_ps(f_gb, f_gb_ai);
1475 tx = _mm_mul_ps(f_gb, dx);
1476 ty = _mm_mul_ps(f_gb, dy);
1477 tz = _mm_mul_ps(f_gb, dz);
1479 fix = _mm_add_ps(fix, tx);
1480 fiy = _mm_add_ps(fiy, ty);
1481 fiz = _mm_add_ps(fiz, tz);
1485 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
1487 else if (offset == 2)
1489 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
1493 /* offset must be 3 */
1494 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
1498 /* fix/fiy/fiz now contain four partial force terms, that all should be
1499 * added to the i particle forces and shift forces.
1501 gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
1509 /* keep compiler happy */
1510 int genborn_sse_dummy;
1512 #endif /* SSE intrinsics available */