/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include <math.h>

#include "gromacs/fileio/pdbio.h"
#include "gromacs/legacyheaders/domdec.h"
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"
/* Only compile this file if SSE intrinsics are available; note that the
 * leading "0 &&" keeps these legacy kernels disabled even then. */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)

#include "genborn_sse2_single.h"

#include <emmintrin.h>
#include <gmx_sse2_single.h>
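
/* This file provides hand-written SSE2 single-precision kernels for the
 * Generalized Born (implicit solvent) setup: calc_gb_rad_still_sse2_single()
 * computes Born radii with the Still model, calc_gb_rad_hct_obc_sse2_single()
 * with the HCT/OBC models, and calc_gb_chainrule_sse2_single() turns the
 * dadx derivatives stored by the radius kernels into forces. All three walk
 * the same neighbor list four pair interactions per __m128 register.
 */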
int
calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              float *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int     i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int     jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
    int     jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
    float   shX, shY, shZ;
    float   gpi_ai, gpi2;
    float   factor;
    float  *gb_radius;
    float  *vsolv;
    float  *work;
    float  *dadx;
    float  *shiftvec;
    int    *jjnr;

    __m128  ix, iy, iz;
    __m128  jx, jy, jz;
    __m128  dx, dy, dz;
    __m128  jxB, jyB, jzB;
    __m128  dxB, dyB, dzB;
    __m128  rsq, rinv, rinv2, rinv4, rinv6;
    __m128  rsqB, rinvB, rinv2B, rinv4B, rinv6B;
    __m128  ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128  ratioB, rajB, vajB, rvdwB;
    __m128  ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128  ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
    __m128  mask, icf4, icf6, mask_cmp;
    __m128  icf4B, icf6B, mask_cmpB;

    __m128  mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128  mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128  mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );

    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);
    const __m128 one   = _mm_set1_ps(1.0f);
    const __m128 two   = _mm_set1_ps(2.0f);
    const __m128 zero  = _mm_set1_ps(0.0f);
    const __m128 four  = _mm_set1_ps(4.0f);

    const __m128 still_p5inv = _mm_set1_ps(STILL_P5INV);
    const __m128 still_pip5  = _mm_set1_ps(STILL_PIP5);
    const __m128 still_p4    = _mm_set1_ps(STILL_P4);

    factor = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = jnrC = jnrD = 0;
    jx   = _mm_setzero_ps();
    jy   = _mm_setzero_ps();
    jz   = _mm_setzero_ps();

    n = 0;
    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix = _mm_set1_ps(shX+x[ii3+0]);
        iy = _mm_set1_ps(shY+x[ii3+1]);
        iz = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;
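
        /* The j-loop is unrolled in three stages: an 8-wide main loop that
         * processes two __m128 quadruplets per iteration, a 4-wide loop for
         * the next chunk, and a masked epilogue for the leftovers; offset is
         * the number (0-3) of leftover pairs handled by the epilogue.
         */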
        /* Polarization energy for atom ai */
        gpi = _mm_setzero_ps();

        rai     = _mm_load1_ps(gb_radius+ii);
        prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
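
        /* Each pair adds P4*V_j*ccf/r^4 to the polarization sum of atom i
         * (and the symmetric P4*V_i*ccf/r^4 term to atom j), following the
         * pairwise approximation of Still et al.; ccf is the close-contact
         * correction computed below and V is the atomic solvation volume.
         */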
        for (k = nj0; k < nj1-4-offset; k += 8)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k+1];
            jnrC = jjnr[k+2];
            jnrD = jjnr[k+3];
            jnrE = jjnr[k+4];
            jnrF = jjnr[k+5];
            jnrG = jjnr[k+6];
            jnrH = jjnr[k+7];

            j3A = 3*jnrA;
            j3B = 3*jnrB;
            j3C = 3*jnrC;
            j3D = 3*jnrD;
            j3E = 3*jnrE;
            j3F = 3*jnrF;
            j3G = 3*jnrG;
            j3H = 3*jnrH;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);

            dx  = _mm_sub_ps(ix, jx);
            dy  = _mm_sub_ps(iy, jy);
            dz  = _mm_sub_ps(iz, jz);
            dxB = _mm_sub_ps(ix, jxB);
            dyB = _mm_sub_ps(iy, jyB);
            dzB = _mm_sub_ps(iz, jzB);

            rsq    = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rsqB   = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
            rinv   = gmx_mm_invsqrt_ps(rsq);
            rinvB  = gmx_mm_invsqrt_ps(rsqB);
            rinv2  = _mm_mul_ps(rinv, rinv);
            rinv2B = _mm_mul_ps(rinvB, rinvB);
            rinv4  = _mm_mul_ps(rinv2, rinv2);
            rinv4B = _mm_mul_ps(rinv2B, rinv2B);
            rinv6  = _mm_mul_ps(rinv4, rinv2);
            rinv6B = _mm_mul_ps(rinv4B, rinv2B);

            rvdw   = _mm_add_ps(rai, raj);
            rvdwB  = _mm_add_ps(rai, rajB);
            ratio  = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
            ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));

            mask_cmp  = _mm_cmple_ps(ratio, still_p5inv);
            mask_cmpB = _mm_cmple_ps(ratioB, still_p5inv);
            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_ps(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_ps();
            }
            else
            {
                ratio = _mm_min_ps(ratio, still_p5inv);
                theta = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term  = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf   = _mm_mul_ps(term, term);
                dccf  = _mm_mul_ps(_mm_mul_ps(two, term),
                                   _mm_mul_ps(sinq, theta));
            }
            if (0 == _mm_movemask_ps(mask_cmpB) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccfB  = one;
                dccfB = _mm_setzero_ps();
            }
            else
            {
                ratioB = _mm_min_ps(ratioB, still_p5inv);
                thetaB = _mm_mul_ps(ratioB, still_pip5);
                gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
                termB  = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
                ccfB   = _mm_mul_ps(termB, termB);
                dccfB  = _mm_mul_ps(_mm_mul_ps(two, termB),
                                    _mm_mul_ps(sinqB, thetaB));
            }
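
            /* ccf is Still's close-contact function: with theta =
             * min(ratio, 1/P5)*P5*pi, ccf = (0.5*(1-cos(theta)))^2 scales the
             * pair contribution smoothly up to 1 once the atoms no longer
             * overlap (the clamped theta = pi gives exactly ccf = 1, dccf = 0,
             * matching the early-out branch above); dccf is the corresponding
             * factor for the derivative stored in dadx below.
             */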
            prod  = _mm_mul_ps(still_p4, vaj);
            prodB = _mm_mul_ps(still_p4, vajB);
            icf4  = _mm_mul_ps(ccf, rinv4);
            icf4B = _mm_mul_ps(ccfB, rinv4B);
            icf6  = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
            icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));

            gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
            dadx += 4;
        }
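
        /* fr->dadx is a flat stream of derivative quadruplets that is
         * consumed in exactly the same neighbor-list order by
         * calc_gb_chainrule_sse2_single(), which is why the pointer is
         * simply bumped by 4 after every store.
         */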
        for (; k < nj1-offset; k += 4)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k+1];
            jnrC = jjnr[k+2];
            jnrD = jjnr[k+3];

            j3A = 3*jnrA;
            j3B = 3*jnrB;
            j3C = 3*jnrC;
            j3D = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);

            dx = _mm_sub_ps(ix, jx);
            dy = _mm_sub_ps(iy, jy);
            dz = _mm_sub_ps(iz, jz);

            rsq   = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_ps(rsq);
            rinv2 = _mm_mul_ps(rinv, rinv);
            rinv4 = _mm_mul_ps(rinv2, rinv2);
            rinv6 = _mm_mul_ps(rinv4, rinv2);

            rvdw  = _mm_add_ps(rai, raj);
            ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp = _mm_cmple_ps(ratio, still_p5inv);

            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_ps();
            }
            else
            {
                ratio = _mm_min_ps(ratio, still_p5inv);
                theta = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term  = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf   = _mm_mul_ps(term, term);
                dccf  = _mm_mul_ps(_mm_mul_ps(two, term),
                                   _mm_mul_ps(sinq, theta));
            }

            prod = _mm_mul_ps(still_p4, vaj);
            icf4 = _mm_mul_ps(ccf, rinv4);
            icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));

            gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
        }
        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA = jjnr[k];
                j3A  = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
                GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
                mask = mask1;
            }
            else if (offset == 2)
            {
                jnrA = jjnr[k];
                jnrB = jjnr[k+1];
                j3A  = 3*jnrA;
                j3B  = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
                GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
                mask = mask2;
            }
            else
            {
                /* offset must be 3 */
                jnrA = jjnr[k];
                jnrB = jjnr[k+1];
                jnrC = jjnr[k+2];
                j3A  = 3*jnrA;
                j3B  = 3*jnrB;
                j3C  = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
                GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
                mask = mask3;
            }

            dx = _mm_sub_ps(ix, jx);
            dy = _mm_sub_ps(iy, jy);
            dz = _mm_sub_ps(iz, jz);

            rsq   = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv  = gmx_mm_invsqrt_ps(rsq);
            rinv2 = _mm_mul_ps(rinv, rinv);
            rinv4 = _mm_mul_ps(rinv2, rinv2);
            rinv6 = _mm_mul_ps(rinv4, rinv2);

            rvdw  = _mm_add_ps(rai, raj);
            ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
            mask_cmp = _mm_and_ps(mask_cmp, mask);

            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf  = one;
                dccf = _mm_setzero_ps();
            }
            else
            {
                ratio = _mm_min_ps(ratio, still_p5inv);
                theta = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term  = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf   = _mm_mul_ps(term, term);
                dccf  = _mm_mul_ps(_mm_mul_ps(two, term),
                                   _mm_mul_ps(sinq, theta));
            }

            prod = _mm_mul_ps(still_p4, vaj);
            icf4 = _mm_mul_ps(ccf, rinv4);
            icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;

            tmp = _mm_mul_ps(prod_ai, icf4);
            tmp = _mm_and_ps(tmp, mask);

            if (offset == 1)
            {
                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
            }
            else if (offset == 2)
            {
                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
            }
        }
        GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }
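
    /* The Still Born radius follows from the accumulated polarization
     * energy as R_i = factor/|gpol_i|; the factor*invsqrt(gpi^2) form below
     * yields the positive radius regardless of the sign of gpol_i, and
     * fr->invsqrta is filled in for the GB energy/force kernels.
     */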
    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai          = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2            = gpi_ai * gpi_ai;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
int
calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                                float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int     i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int     jnrA, jnrB, jnrC, jnrD;
    int     j3A, j3B, j3C, j3D;
    int     jnrE, jnrF, jnrG, jnrH;
    int     j3E, j3F, j3G, j3H;

    float   rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    float   sum_ai2, sum_ai3, tsum, tchain, doffset;
    float   shX, shY, shZ;
    float   min_rad, rad;
    float  *obc_param;
    float  *gb_radius;
    float  *work;
    float  *dadx;
    float  *shiftvec;
    int    *jjnr;

    __m128  ix, iy, iz, jx, jy, jz;
    __m128  dx, dy, dz, t1, t2, t3, t4;
    __m128  rsq, rinv, r;
    __m128  rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128  uij, lij2, uij2, lij3, uij3, diff2;
    __m128  lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128  sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128  dadx1, dadx2;
    __m128  logterm, logtermB;
    __m128  mask;
    __m128  obc_mask1, obc_mask2, obc_mask3;
    __m128  jxB, jyB, jzB, t1B, t2B, t3B, t4B;
    __m128  dxB, dyB, dzB, rsqB, rinvB, rB;
    __m128  rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
    __m128  uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
    __m128  lij_invB, sk2_invB, prodB;
    __m128  sk_ajB, sk2_ajB, sk2_rinvB;
    __m128  dadx1B, dadx2B;
    __m128  obc_mask1B, obc_mask2B, obc_mask3B;

    __m128  mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128  mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128  mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );

    __m128  oneeighth = _mm_set1_ps(0.125);
    __m128  onefourth = _mm_set1_ps(0.25);

    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);
    const __m128 one   = _mm_set1_ps(1.0f);
    const __m128 two   = _mm_set1_ps(2.0f);
    const __m128 zero  = _mm_set1_ps(0.0f);
    const __m128 neg   = _mm_set1_ps(-1.0f);
    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx = _mm_setzero_ps();
    jy = _mm_setzero_ps();
    jz = _mm_setzero_ps();

    jnrA = jnrB = jnrC = jnrD = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix = _mm_set1_ps(shX+x[ii3+0]);
        iy = _mm_set1_ps(shY+x[ii3+1]);
        iz = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        rai     = _mm_load1_ps(gb_radius+ii);
        rai_inv = gmx_mm_inv_ps(rai);

        sum_ai = _mm_setzero_ps();

        sk_ai  = _mm_load1_ps(born->param+ii);
        sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
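
        /* Every neighbor j descreens atom i following Hawkins-Cramer-Truhlar:
         * with u = 1/(r+sk_j) and l = 1/(r-sk_j) (or 1/rai when atom i pokes
         * out of the scaled sphere of j), the pair adds
         *   0.5*( l - u + (u^2-l^2)*(r - sk_j^2/r)/4 + ln(u/l)/(2r) )
         * to sum_ai, plus a 2*(1/rai - l) correction when atom i lies fully
         * inside the scaled sphere (sk_j - r > rai). The t1/t2/t3 blocks
         * below evaluate this integral and dadx1/dadx2 its r-derivatives.
         */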
        for (k = nj0; k < nj1-4-offset; k += 8)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k+1];
            jnrC = jjnr[k+2];
            jnrD = jjnr[k+3];
            jnrE = jjnr[k+4];
            jnrF = jjnr[k+5];
            jnrG = jjnr[k+6];
            jnrH = jjnr[k+7];

            j3A = 3*jnrA;
            j3B = 3*jnrB;
            j3C = 3*jnrC;
            j3D = 3*jnrD;
            j3E = 3*jnrE;
            j3F = 3*jnrF;
            j3G = 3*jnrG;
            j3H = 3*jnrH;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);

            dx  = _mm_sub_ps(ix, jx);
            dy  = _mm_sub_ps(iy, jy);
            dz  = _mm_sub_ps(iz, jz);
            dxB = _mm_sub_ps(ix, jxB);
            dyB = _mm_sub_ps(iy, jyB);
            dzB = _mm_sub_ps(iz, jzB);

            rsq  = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);

            rinv  = gmx_mm_invsqrt_ps(rsq);
            r     = _mm_mul_ps(rsq, rinv);
            rinvB = gmx_mm_invsqrt_ps(rsqB);
            rB    = _mm_mul_ps(rsqB, rinvB);

            /* Compute raj_inv aj1-4 */
            raj_inv  = gmx_mm_inv_ps(raj);
            raj_invB = gmx_mm_inv_ps(rajB);
            /* Evaluate influence of atom aj -> ai */
            t1  = _mm_add_ps(r, sk_aj);
            t2  = _mm_sub_ps(r, sk_aj);
            t3  = _mm_sub_ps(sk_aj, r);
            t1B = _mm_add_ps(rB, sk_ajB);
            t2B = _mm_sub_ps(rB, sk_ajB);
            t3B = _mm_sub_ps(sk_ajB, rB);
            obc_mask1  = _mm_cmplt_ps(rai, t1);
            obc_mask2  = _mm_cmplt_ps(rai, t2);
            obc_mask3  = _mm_cmplt_ps(rai, t3);
            obc_mask1B = _mm_cmplt_ps(rai, t1B);
            obc_mask2B = _mm_cmplt_ps(rai, t2B);
            obc_mask3B = _mm_cmplt_ps(rai, t3B);

            uij  = gmx_mm_inv_ps(t1);
            lij  = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                              _mm_andnot_ps(obc_mask2, rai_inv));
            dlij = _mm_and_ps(one, obc_mask2);
            uij2 = _mm_mul_ps(uij, uij);
            uij3 = _mm_mul_ps(uij2, uij);
            lij2 = _mm_mul_ps(lij, lij);
            lij3 = _mm_mul_ps(lij2, lij);

            uijB  = gmx_mm_inv_ps(t1B);
            lijB  = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
                               _mm_andnot_ps(obc_mask2B, rai_inv));
            dlijB = _mm_and_ps(one, obc_mask2B);
            uij2B = _mm_mul_ps(uijB, uijB);
            uij3B = _mm_mul_ps(uij2B, uijB);
            lij2B = _mm_mul_ps(lijB, lijB);
            lij3B = _mm_mul_ps(lij2B, lijB);

            diff2    = _mm_sub_ps(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_ps(lij2);
            sk2_aj   = _mm_mul_ps(sk_aj, sk_aj);
            sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
            prod     = _mm_mul_ps(onefourth, sk2_rinv);

            diff2B    = _mm_sub_ps(uij2B, lij2B);
            lij_invB  = gmx_mm_invsqrt_ps(lij2B);
            sk2_ajB   = _mm_mul_ps(sk_ajB, sk_ajB);
            sk2_rinvB = _mm_mul_ps(sk2_ajB, rinvB);
            prodB     = _mm_mul_ps(onefourth, sk2_rinvB);

            logterm  = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
            logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));

            t1 = _mm_sub_ps(lij, uij);
            t2 = _mm_mul_ps(diff2,
                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                       prod));
            t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
            t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
            t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
            t4 = _mm_and_ps(t4, obc_mask3);
            t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));

            t1B = _mm_sub_ps(lijB, uijB);
            t2B = _mm_mul_ps(diff2B,
                             _mm_sub_ps(_mm_mul_ps(onefourth, rB),
                                        prodB));
            t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
            t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
            t4B = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
            t4B = _mm_and_ps(t4B, obc_mask3B);
            t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));

            sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));

            t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
                            _mm_mul_ps(prod, lij3));
            t1 = _mm_sub_ps(t1,
                            _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                  _mm_mul_ps(lij3, r))));
            t2 = _mm_mul_ps(onefourth,
                            _mm_add_ps(_mm_mul_ps(uij, rinv),
                                       _mm_mul_ps(uij3, r)));
            t2 = _mm_sub_ps(t2,
                            _mm_add_ps(_mm_mul_ps(half, uij2),
                                       _mm_mul_ps(prod, uij3)));
            t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                            _mm_mul_ps(rinv, rinv));
            t3 = _mm_sub_ps(t3,
                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                       _mm_add_ps(one,
                                                  _mm_mul_ps(sk2_rinv, rinv))));
            t1 = _mm_mul_ps(rinv,
                            _mm_add_ps(_mm_mul_ps(dlij, t1),
                                       _mm_add_ps(t2, t3)));

            t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
                             _mm_mul_ps(prodB, lij3B));
            t1B = _mm_sub_ps(t1B,
                             _mm_mul_ps(onefourth,
                                        _mm_add_ps(_mm_mul_ps(lijB, rinvB),
                                                   _mm_mul_ps(lij3B, rB))));
            t2B = _mm_mul_ps(onefourth,
                             _mm_add_ps(_mm_mul_ps(uijB, rinvB),
                                        _mm_mul_ps(uij3B, rB)));
            t2B = _mm_sub_ps(t2B,
                             _mm_add_ps(_mm_mul_ps(half, uij2B),
                                        _mm_mul_ps(prodB, uij3B)));
            t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
                             _mm_mul_ps(rinvB, rinvB));
            t3B = _mm_sub_ps(t3B,
                             _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
                                        _mm_add_ps(one,
                                                   _mm_mul_ps(sk2_rinvB, rinvB))));
            t1B = _mm_mul_ps(rinvB,
                             _mm_add_ps(_mm_mul_ps(dlijB, t1B),
                                        _mm_add_ps(t2B, t3B)));

            dadx1  = _mm_and_ps(t1, obc_mask1);
            dadx1B = _mm_and_ps(t1B, obc_mask1B);
            /* Evaluate influence of atom ai -> aj */
            t1  = _mm_add_ps(r, sk_ai);
            t2  = _mm_sub_ps(r, sk_ai);
            t3  = _mm_sub_ps(sk_ai, r);
            t1B = _mm_add_ps(rB, sk_ai);
            t2B = _mm_sub_ps(rB, sk_ai);
            t3B = _mm_sub_ps(sk_ai, rB);
            obc_mask1  = _mm_cmplt_ps(raj, t1);
            obc_mask2  = _mm_cmplt_ps(raj, t2);
            obc_mask3  = _mm_cmplt_ps(raj, t3);
            obc_mask1B = _mm_cmplt_ps(rajB, t1B);
            obc_mask2B = _mm_cmplt_ps(rajB, t2B);
            obc_mask3B = _mm_cmplt_ps(rajB, t3B);

            uij  = gmx_mm_inv_ps(t1);
            lij  = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                              _mm_andnot_ps(obc_mask2, raj_inv));
            dlij = _mm_and_ps(one, obc_mask2);
            uij2 = _mm_mul_ps(uij, uij);
            uij3 = _mm_mul_ps(uij2, uij);
            lij2 = _mm_mul_ps(lij, lij);
            lij3 = _mm_mul_ps(lij2, lij);

            uijB  = gmx_mm_inv_ps(t1B);
            lijB  = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
                               _mm_andnot_ps(obc_mask2B, raj_invB));
            dlijB = _mm_and_ps(one, obc_mask2B);
            uij2B = _mm_mul_ps(uijB, uijB);
            uij3B = _mm_mul_ps(uij2B, uijB);
            lij2B = _mm_mul_ps(lijB, lijB);
            lij3B = _mm_mul_ps(lij2B, lijB);

            diff2    = _mm_sub_ps(uij2, lij2);
            lij_inv  = gmx_mm_invsqrt_ps(lij2);
            sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
            prod     = _mm_mul_ps(onefourth, sk2_rinv);

            diff2B    = _mm_sub_ps(uij2B, lij2B);
            lij_invB  = gmx_mm_invsqrt_ps(lij2B);
            sk2_rinvB = _mm_mul_ps(sk2_ai, rinvB);
            prodB     = _mm_mul_ps(onefourth, sk2_rinvB);

            logterm  = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
            logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));

            t1 = _mm_sub_ps(lij, uij);
            t2 = _mm_mul_ps(diff2,
                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                       prod));
            t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
            t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
            t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
            t4 = _mm_and_ps(t4, obc_mask3);
            t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));

            t1B = _mm_sub_ps(lijB, uijB);
            t2B = _mm_mul_ps(diff2B,
                             _mm_sub_ps(_mm_mul_ps(onefourth, rB),
                                        prodB));
            t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
            t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
            t4B = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
            t4B = _mm_and_ps(t4B, obc_mask3B);
            t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));

            t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
                            _mm_mul_ps(prod, lij3));
            t1 = _mm_sub_ps(t1,
                            _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                  _mm_mul_ps(lij3, r))));
            t2 = _mm_mul_ps(onefourth,
                            _mm_add_ps(_mm_mul_ps(uij, rinv),
                                       _mm_mul_ps(uij3, r)));
            t2 = _mm_sub_ps(t2,
                            _mm_add_ps(_mm_mul_ps(half, uij2),
                                       _mm_mul_ps(prod, uij3)));
            t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                            _mm_mul_ps(rinv, rinv));
            t3 = _mm_sub_ps(t3,
                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                       _mm_add_ps(one,
                                                  _mm_mul_ps(sk2_rinv, rinv))));
            t1 = _mm_mul_ps(rinv,
                            _mm_add_ps(_mm_mul_ps(dlij, t1),
                                       _mm_add_ps(t2, t3)));

            t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
                             _mm_mul_ps(prodB, lij3B));
            t1B = _mm_sub_ps(t1B,
                             _mm_mul_ps(onefourth,
                                        _mm_add_ps(_mm_mul_ps(lijB, rinvB),
                                                   _mm_mul_ps(lij3B, rB))));
            t2B = _mm_mul_ps(onefourth,
                             _mm_add_ps(_mm_mul_ps(uijB, rinvB),
                                        _mm_mul_ps(uij3B, rB)));
            t2B = _mm_sub_ps(t2B,
                             _mm_add_ps(_mm_mul_ps(half, uij2B),
                                        _mm_mul_ps(prodB, uij3B)));
            t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
                             _mm_mul_ps(rinvB, rinvB));
            t3B = _mm_sub_ps(t3B,
                             _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
                                        _mm_add_ps(one,
                                                   _mm_mul_ps(sk2_rinvB, rinvB))));
            t1B = _mm_mul_ps(rinvB,
                             _mm_add_ps(_mm_mul_ps(dlijB, t1B),
                                        _mm_add_ps(t2B, t3B)));

            dadx2  = _mm_and_ps(t1, obc_mask1);
            dadx2B = _mm_and_ps(t1B, obc_mask1B);

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;
            _mm_store_ps(dadx, dadx1B);
            dadx += 4;
            _mm_store_ps(dadx, dadx2B);
            dadx += 4;
        } /* end normal inner loop */
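
        /* Same pair evaluation, four neighbors at a time; here the movemask
         * early-outs pay off, since whole quadruplets outside the
         * descreening range skip the expensive log evaluation entirely.
         */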
        for (; k < nj1-offset; k += 4)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k+1];
            jnrC = jjnr[k+2];
            jnrD = jjnr[k+3];

            j3A = 3*jnrA;
            j3B = 3*jnrB;
            j3C = 3*jnrC;
            j3D = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);

            dx = _mm_sub_ps(ix, jx);
            dy = _mm_sub_ps(iy, jy);
            dz = _mm_sub_ps(iz, jz);

            rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);

            rinv = gmx_mm_invsqrt_ps(rsq);
            r    = _mm_mul_ps(rsq, rinv);

            /* Compute raj_inv aj1-4 */
            raj_inv = gmx_mm_inv_ps(raj);

            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_ps(r, sk_aj);
            obc_mask1 = _mm_cmplt_ps(rai, t1);

            if (_mm_movemask_ps(obc_mask1))
            {
                /* If any of the elements has rai<dr+sk, this is executed */
                t2 = _mm_sub_ps(r, sk_aj);
                t3 = _mm_sub_ps(sk_aj, r);

                obc_mask2 = _mm_cmplt_ps(rai, t2);
                obc_mask3 = _mm_cmplt_ps(rai, t3);

                uij      = gmx_mm_inv_ps(t1);
                lij      = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                      _mm_andnot_ps(obc_mask2, rai_inv));
                dlij     = _mm_and_ps(one, obc_mask2);
                uij2     = _mm_mul_ps(uij, uij);
                uij3     = _mm_mul_ps(uij2, uij);
                lij2     = _mm_mul_ps(lij, lij);
                lij3     = _mm_mul_ps(lij2, lij);
                diff2    = _mm_sub_ps(uij2, lij2);
                lij_inv  = gmx_mm_invsqrt_ps(lij2);
                sk2_aj   = _mm_mul_ps(sk_aj, sk_aj);
                sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
                prod     = _mm_mul_ps(onefourth, sk2_rinv);
                logterm  = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1       = _mm_sub_ps(lij, uij);
                t2       = _mm_mul_ps(diff2,
                                      _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                 prod));
                t3       = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1       = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4       = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
                t4       = _mm_and_ps(t4, obc_mask3);
                t1       = _mm_mul_ps(half, _mm_add_ps(t1, t4));
                sum_ai   = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
                t1       = _mm_add_ps(_mm_mul_ps(half, lij2),
                                      _mm_mul_ps(prod, lij3));
                t1       = _mm_sub_ps(t1,
                                      _mm_mul_ps(onefourth,
                                                 _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                            _mm_mul_ps(lij3, r))));
                t2       = _mm_mul_ps(onefourth,
                                      _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                 _mm_mul_ps(uij3, r)));
                t2       = _mm_sub_ps(t2,
                                      _mm_add_ps(_mm_mul_ps(half, uij2),
                                                 _mm_mul_ps(prod, uij3)));
                t3       = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                      _mm_mul_ps(rinv, rinv));
                t3       = _mm_sub_ps(t3,
                                      _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                 _mm_add_ps(one,
                                                            _mm_mul_ps(sk2_rinv, rinv))));
                t1       = _mm_mul_ps(rinv,
                                      _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                 _mm_add_ps(t2, t3)));

                dadx1 = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx1 = _mm_setzero_ps();
            }

            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_ps(r, sk_ai);
            obc_mask1 = _mm_cmplt_ps(raj, t1);

            if (_mm_movemask_ps(obc_mask1))
            {
                t2        = _mm_sub_ps(r, sk_ai);
                t3        = _mm_sub_ps(sk_ai, r);
                obc_mask2 = _mm_cmplt_ps(raj, t2);
                obc_mask3 = _mm_cmplt_ps(raj, t3);

                uij      = gmx_mm_inv_ps(t1);
                lij      = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                      _mm_andnot_ps(obc_mask2, raj_inv));
                dlij     = _mm_and_ps(one, obc_mask2);
                uij2     = _mm_mul_ps(uij, uij);
                uij3     = _mm_mul_ps(uij2, uij);
                lij2     = _mm_mul_ps(lij, lij);
                lij3     = _mm_mul_ps(lij2, lij);
                diff2    = _mm_sub_ps(uij2, lij2);
                lij_inv  = gmx_mm_invsqrt_ps(lij2);
                sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
                prod     = _mm_mul_ps(onefourth, sk2_rinv);
                logterm  = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1       = _mm_sub_ps(lij, uij);
                t2       = _mm_mul_ps(diff2,
                                      _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                 prod));
                t3       = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1       = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4       = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
                t4       = _mm_and_ps(t4, obc_mask3);
                t1       = _mm_mul_ps(half, _mm_add_ps(t1, t4));

                GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));

                t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
                                _mm_mul_ps(prod, lij3));
                t1 = _mm_sub_ps(t1,
                                _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                      _mm_mul_ps(lij3, r))));
                t2 = _mm_mul_ps(onefourth,
                                _mm_add_ps(_mm_mul_ps(uij, rinv),
                                           _mm_mul_ps(uij3, r)));
                t2 = _mm_sub_ps(t2,
                                _mm_add_ps(_mm_mul_ps(half, uij2),
                                           _mm_mul_ps(prod, uij3)));
                t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                _mm_mul_ps(rinv, rinv));
                t3 = _mm_sub_ps(t3,
                                _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                           _mm_add_ps(one,
                                                      _mm_mul_ps(sk2_rinv, rinv))));
                t1 = _mm_mul_ps(rinv,
                                _mm_add_ps(_mm_mul_ps(dlij, t1),
                                           _mm_add_ps(t2, t3)));
                dadx2 = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx2 = _mm_setzero_ps();
            }

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;
        } /* end normal inner loop */
        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA = jjnr[k];
                j3A  = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
                GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
                mask = mask1;
            }
            else if (offset == 2)
            {
                jnrA = jjnr[k];
                jnrB = jjnr[k+1];
                j3A  = 3*jnrA;
                j3B  = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
                GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
                mask = mask2;
            }
            else
            {
                /* offset must be 3 */
                jnrA = jjnr[k];
                jnrB = jjnr[k+1];
                jnrC = jjnr[k+2];
                j3A  = 3*jnrA;
                j3B  = 3*jnrB;
                j3C  = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
                GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
                mask = mask3;
            }

            dx = _mm_sub_ps(ix, jx);
            dy = _mm_sub_ps(iy, jy);
            dz = _mm_sub_ps(iz, jz);

            rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);

            rinv = gmx_mm_invsqrt_ps(rsq);
            r    = _mm_mul_ps(rsq, rinv);

            /* Compute raj_inv aj1-4 */
            raj_inv = gmx_mm_inv_ps(raj);

            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_ps(r, sk_aj);
            obc_mask1 = _mm_cmplt_ps(rai, t1);
            obc_mask1 = _mm_and_ps(obc_mask1, mask);

            if (_mm_movemask_ps(obc_mask1))
            {
                t2        = _mm_sub_ps(r, sk_aj);
                t3        = _mm_sub_ps(sk_aj, r);
                obc_mask2 = _mm_cmplt_ps(rai, t2);
                obc_mask3 = _mm_cmplt_ps(rai, t3);

                uij      = gmx_mm_inv_ps(t1);
                lij      = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                      _mm_andnot_ps(obc_mask2, rai_inv));
                dlij     = _mm_and_ps(one, obc_mask2);
                uij2     = _mm_mul_ps(uij, uij);
                uij3     = _mm_mul_ps(uij2, uij);
                lij2     = _mm_mul_ps(lij, lij);
                lij3     = _mm_mul_ps(lij2, lij);
                diff2    = _mm_sub_ps(uij2, lij2);
                lij_inv  = gmx_mm_invsqrt_ps(lij2);
                sk2_aj   = _mm_mul_ps(sk_aj, sk_aj);
                sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
                prod     = _mm_mul_ps(onefourth, sk2_rinv);
                logterm  = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1       = _mm_sub_ps(lij, uij);
                t2       = _mm_mul_ps(diff2,
                                      _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                 prod));
                t3       = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1       = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4       = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
                t4       = _mm_and_ps(t4, obc_mask3);
                t1       = _mm_mul_ps(half, _mm_add_ps(t1, t4));
                sum_ai   = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
                t1       = _mm_add_ps(_mm_mul_ps(half, lij2),
                                      _mm_mul_ps(prod, lij3));
                t1       = _mm_sub_ps(t1,
                                      _mm_mul_ps(onefourth,
                                                 _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                            _mm_mul_ps(lij3, r))));
                t2       = _mm_mul_ps(onefourth,
                                      _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                 _mm_mul_ps(uij3, r)));
                t2       = _mm_sub_ps(t2,
                                      _mm_add_ps(_mm_mul_ps(half, uij2),
                                                 _mm_mul_ps(prod, uij3)));
                t3       = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                      _mm_mul_ps(rinv, rinv));
                t3       = _mm_sub_ps(t3,
                                      _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                 _mm_add_ps(one,
                                                            _mm_mul_ps(sk2_rinv, rinv))));
                t1       = _mm_mul_ps(rinv,
                                      _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                 _mm_add_ps(t2, t3)));
                dadx1    = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx1 = _mm_setzero_ps();
            }

            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_ps(r, sk_ai);
            obc_mask1 = _mm_cmplt_ps(raj, t1);
            obc_mask1 = _mm_and_ps(obc_mask1, mask);

            if (_mm_movemask_ps(obc_mask1))
            {
                t2        = _mm_sub_ps(r, sk_ai);
                t3        = _mm_sub_ps(sk_ai, r);
                obc_mask2 = _mm_cmplt_ps(raj, t2);
                obc_mask3 = _mm_cmplt_ps(raj, t3);

                uij      = gmx_mm_inv_ps(t1);
                lij      = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                     _mm_andnot_ps(obc_mask2, raj_inv));
                dlij     = _mm_and_ps(one, obc_mask2);
                uij2     = _mm_mul_ps(uij, uij);
                uij3     = _mm_mul_ps(uij2, uij);
                lij2     = _mm_mul_ps(lij, lij);
                lij3     = _mm_mul_ps(lij2, lij);
                diff2    = _mm_sub_ps(uij2, lij2);
                lij_inv  = gmx_mm_invsqrt_ps(lij2);
                sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
                prod     = _mm_mul_ps(onefourth, sk2_rinv);
                logterm  = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1       = _mm_sub_ps(lij, uij);
                t2       = _mm_mul_ps(diff2,
                                      _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                 prod));
                t3       = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1       = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4       = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
                t4       = _mm_and_ps(t4, obc_mask3);
                t1       = _mm_mul_ps(half, _mm_add_ps(t1, t4));

                tmp      = _mm_and_ps(t1, obc_mask1);

                t1       = _mm_add_ps(_mm_mul_ps(half, lij2),
                                      _mm_mul_ps(prod, lij3));
                t1       = _mm_sub_ps(t1,
                                      _mm_mul_ps(onefourth,
                                                 _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                            _mm_mul_ps(lij3, r))));
                t2       = _mm_mul_ps(onefourth,
                                      _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                 _mm_mul_ps(uij3, r)));
                t2       = _mm_sub_ps(t2,
                                      _mm_add_ps(_mm_mul_ps(half, uij2),
                                                 _mm_mul_ps(prod, uij3)));
                t3       = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                      _mm_mul_ps(rinv, rinv));
                t3       = _mm_sub_ps(t3,
                                      _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                 _mm_add_ps(one,
                                                            _mm_mul_ps(sk2_rinv, rinv))));
                t1       = _mm_mul_ps(rinv,
                                      _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                 _mm_add_ps(t2, t3)));
                dadx2    = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx2 = _mm_setzero_ps();
                tmp   = _mm_setzero_ps();
            }

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;

            if (offset == 1)
            {
                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
            }
            else if (offset == 2)
            {
                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
            }
        }
        GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
    }

    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }
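
    /* work[] now holds the accumulated descreening sum for every local atom;
     * the HCT branch inverts it directly, while the OBC branch first pushes
     * it through the tanh rescaling of Onufriev, Bashford and Case.
     */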
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
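    /* OBC rescaling: 1/R_i = 1/rho~_i - tanh(alpha*psi - beta*psi^2 +
     * gamma*psi^3)/rho_i, with rho~_i = rho_i - doffset and psi =
     * rho~_i*work[i]; drobc stores the derivative of the tanh term with
     * respect to the descreening sum, needed by the chain-rule kernel.
     */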
    else
    {
        /* OBC */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }

        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}
float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
                                    float *x, float *f, float *fshift, float *shiftvec,
                                    int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int     i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
    int     jnrA, jnrB, jnrC, jnrD;
    int     j3A, j3B, j3C, j3D;
    int     jnrE, jnrF, jnrG, jnrH;
    int     j3E, j3F, j3G, j3H;

    float   rbi, shX, shY, shZ;
    float  *rb;

    __m128  ix, iy, iz;
    __m128  jx, jy, jz;
    __m128  jxB, jyB, jzB;
    __m128  fix, fiy, fiz;
    __m128  dx, dy, dz;
    __m128  tx, ty, tz;
    __m128  dxB, dyB, dzB;
    __m128  txB, tyB, tzB;

    __m128  rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
    __m128  xmm1, xmm2, xmm3;

    const __m128 two = _mm_set1_ps(2.0f);

    rb = born->work;
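
    /* rb[] folds everything that multiplies dadx into one per-atom factor:
     * for Still the radius term enters as 2*R^2*dvda/ONE_4PI_EPS0, for HCT
     * as R^2*dvda, and for OBC as R^2*drobc*dvda, so the force loops below
     * are identical for all three GB flavors.
     */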
    /* Loop to get the proper form for the Born radius term, sse style */
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }
    jx = _mm_setzero_ps();
    jy = _mm_setzero_ps();
    jz = _mm_setzero_ps();

    n = j3A = j3B = j3C = j3D = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix = _mm_set1_ps(shX+x[ii3+0]);
        iy = _mm_set1_ps(shY+x[ii3+1]);
        iz = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        rbai = _mm_load1_ps(rb+ii);
        fix  = _mm_setzero_ps();
        fiy  = _mm_setzero_ps();
        fiz  = _mm_setzero_ps();
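
        /* Each pass pairs the stored derivative quadruplets with the rb
         * factors of both atoms: f_gb*d gives the force on i along the pair
         * vector, the same amount is subtracted from j (Newton's third law),
         * and the per-lane sums are folded into f and fshift at the end.
         */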
        for (k = nj0; k < nj1-offset; k += 4)
        {
            jnrA = nl->jjnr[k];
            jnrB = nl->jjnr[k+1];
            jnrC = nl->jjnr[k+2];
            jnrD = nl->jjnr[k+3];

            j3A = 3*jnrA;
            j3B = 3*jnrB;
            j3C = 3*jnrC;
            j3D = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);

            dx = _mm_sub_ps(ix, jx);
            dy = _mm_sub_ps(iy, jy);
            dz = _mm_sub_ps(iz, jz);

            GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);

            /* load chain rule terms for j1-4 */
            f_gb = _mm_load_ps(dadx);
            dadx += 4;
            f_gb_ai = _mm_load_ps(dadx);
            dadx += 4;

            /* calculate scalar force */
            f_gb    = _mm_mul_ps(f_gb, rbai);
            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
            f_gb    = _mm_add_ps(f_gb, f_gb_ai);

            tx = _mm_mul_ps(f_gb, dx);
            ty = _mm_mul_ps(f_gb, dy);
            tz = _mm_mul_ps(f_gb, dz);

            fix = _mm_add_ps(fix, tx);
            fiy = _mm_add_ps(fiy, ty);
            fiz = _mm_add_ps(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
        }
        /* deal with odd elements */
        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA = nl->jjnr[k];
                j3A  = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
            }
            else if (offset == 2)
            {
                jnrA = nl->jjnr[k];
                jnrB = nl->jjnr[k+1];
                j3A  = 3*jnrA;
                j3B  = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
            }
            else
            {
                /* offset must be 3 */
                jnrA = nl->jjnr[k];
                jnrB = nl->jjnr[k+1];
                jnrC = nl->jjnr[k+2];
                j3A  = 3*jnrA;
                j3B  = 3*jnrB;
                j3C  = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
            }

            dx = _mm_sub_ps(ix, jx);
            dy = _mm_sub_ps(iy, jy);
            dz = _mm_sub_ps(iz, jz);

            /* load chain rule terms for j1-4 */
            f_gb = _mm_load_ps(dadx);
            dadx += 4;
            f_gb_ai = _mm_load_ps(dadx);
            dadx += 4;

            /* calculate scalar force */
            f_gb    = _mm_mul_ps(f_gb, rbai);
            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
            f_gb    = _mm_add_ps(f_gb, f_gb_ai);

            tx = _mm_mul_ps(f_gb, dx);
            ty = _mm_mul_ps(f_gb, dy);
            tz = _mm_mul_ps(f_gb, dz);

            fix = _mm_add_ps(fix, tx);
            fiy = _mm_add_ps(fiy, ty);
            fiz = _mm_add_ps(fiz, tz);

            if (offset == 1)
            {
                GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
            }
            else if (offset == 2)
            {
                GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
            }
        }

        /* fix/fiy/fiz now contain four partial force terms that all need to
         * be added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}
#else
/* keep compiler happy */
int genborn_sse_dummy;

#endif /* SSE intrinsics available */