2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2013,2014, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
45 #include "gromacs/utility/smalloc.h"
47 #include "gromacs/math/vec.h"
48 #include "gromacs/fileio/pdbio.h"
53 #include "gromacs/utility/fatalerror.h"
54 #include "mtop_util.h"
57 #include "gromacs/utility/gmxmpi.h"
60 /* Only compile this file if SSE intrinsics are available */
61 #if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
63 #include <gmx_sse2_single.h>
64 #include <emmintrin.h>
66 #include "genborn_sse2_single.h"
70 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
71 int natoms, gmx_localtop_t *top,
72 float *x, t_nblist *nl,
75 int i, k, n, ii, is3, ii3, nj0, nj1, offset;
76 int jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
77 int jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
98 __m128 rsq, rinv, rinv2, rinv4, rinv6;
99 __m128 rsqB, rinvB, rinv2B, rinv4B, rinv6B;
100 __m128 ratio, gpi, rai, raj, vai, vaj, rvdw;
101 __m128 ratioB, rajB, vajB, rvdwB;
102 __m128 ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
103 __m128 ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
104 __m128 mask, icf4, icf6, mask_cmp;
105 __m128 icf4B, icf6B, mask_cmpB;
107 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
108 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
109 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
111 const __m128 half = _mm_set1_ps(0.5f);
112 const __m128 three = _mm_set1_ps(3.0f);
113 const __m128 one = _mm_set1_ps(1.0f);
114 const __m128 two = _mm_set1_ps(2.0f);
115 const __m128 zero = _mm_set1_ps(0.0f);
116 const __m128 four = _mm_set1_ps(4.0f);
118 const __m128 still_p5inv = _mm_set1_ps(STILL_P5INV);
119 const __m128 still_pip5 = _mm_set1_ps(STILL_PIP5);
120 const __m128 still_p4 = _mm_set1_ps(STILL_P4);
122 factor = 0.5 * ONE_4PI_EPS0;
124 gb_radius = born->gb_radius;
126 work = born->gpol_still_work;
128 shiftvec = fr->shift_vec[0];
131 jnrA = jnrB = jnrC = jnrD = 0;
132 jx = _mm_setzero_ps();
133 jy = _mm_setzero_ps();
134 jz = _mm_setzero_ps();
138 for (i = 0; i < natoms; i++)
143 for (i = 0; i < nl->nri; i++)
147 is3 = 3*nl->shift[i];
149 shY = shiftvec[is3+1];
150 shZ = shiftvec[is3+2];
152 nj1 = nl->jindex[i+1];
154 ix = _mm_set1_ps(shX+x[ii3+0]);
155 iy = _mm_set1_ps(shY+x[ii3+1]);
156 iz = _mm_set1_ps(shZ+x[ii3+2]);
158 offset = (nj1-nj0)%4;
160 /* Polarization energy for atom ai */
161 gpi = _mm_setzero_ps();
163 rai = _mm_load1_ps(gb_radius+ii);
164 prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
166 for (k = nj0; k < nj1-4-offset; k += 8)
186 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
187 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
189 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
190 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
191 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
192 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);
194 dx = _mm_sub_ps(ix, jx);
195 dy = _mm_sub_ps(iy, jy);
196 dz = _mm_sub_ps(iz, jz);
197 dxB = _mm_sub_ps(ix, jxB);
198 dyB = _mm_sub_ps(iy, jyB);
199 dzB = _mm_sub_ps(iz, jzB);
201 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
202 rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
203 rinv = gmx_mm_invsqrt_ps(rsq);
204 rinvB = gmx_mm_invsqrt_ps(rsqB);
205 rinv2 = _mm_mul_ps(rinv, rinv);
206 rinv2B = _mm_mul_ps(rinvB, rinvB);
207 rinv4 = _mm_mul_ps(rinv2, rinv2);
208 rinv4B = _mm_mul_ps(rinv2B, rinv2B);
209 rinv6 = _mm_mul_ps(rinv4, rinv2);
210 rinv6B = _mm_mul_ps(rinv4B, rinv2B);
212 rvdw = _mm_add_ps(rai, raj);
213 rvdwB = _mm_add_ps(rai, rajB);
214 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
215 ratioB = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));
217 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
218 mask_cmpB = _mm_cmple_ps(ratioB, still_p5inv);
220 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
221 if (0 == _mm_movemask_ps(mask_cmp) )
223 /* if ratio>still_p5inv for ALL elements */
225 dccf = _mm_setzero_ps();
229 ratio = _mm_min_ps(ratio, still_p5inv);
230 theta = _mm_mul_ps(ratio, still_pip5);
231 gmx_mm_sincos_ps(theta, &sinq, &cosq);
232 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
233 ccf = _mm_mul_ps(term, term);
234 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
235 _mm_mul_ps(sinq, theta));
237 if (0 == _mm_movemask_ps(mask_cmpB) )
239 /* if ratio>still_p5inv for ALL elements */
241 dccfB = _mm_setzero_ps();
245 ratioB = _mm_min_ps(ratioB, still_p5inv);
246 thetaB = _mm_mul_ps(ratioB, still_pip5);
247 gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
248 termB = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
249 ccfB = _mm_mul_ps(termB, termB);
250 dccfB = _mm_mul_ps(_mm_mul_ps(two, termB),
251 _mm_mul_ps(sinqB, thetaB));
254 prod = _mm_mul_ps(still_p4, vaj);
255 prodB = _mm_mul_ps(still_p4, vajB);
256 icf4 = _mm_mul_ps(ccf, rinv4);
257 icf4B = _mm_mul_ps(ccfB, rinv4B);
258 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
259 icf6B = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);
261 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
262 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));
264 gpi = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );
266 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
268 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
270 _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
272 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
276 for (; k < nj1-offset; k += 4)
288 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
290 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
291 GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
293 dx = _mm_sub_ps(ix, jx);
294 dy = _mm_sub_ps(iy, jy);
295 dz = _mm_sub_ps(iz, jz);
297 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
298 rinv = gmx_mm_invsqrt_ps(rsq);
299 rinv2 = _mm_mul_ps(rinv, rinv);
300 rinv4 = _mm_mul_ps(rinv2, rinv2);
301 rinv6 = _mm_mul_ps(rinv4, rinv2);
303 rvdw = _mm_add_ps(rai, raj);
304 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
306 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
308 /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
309 if (0 == _mm_movemask_ps(mask_cmp))
311 /* if ratio>still_p5inv for ALL elements */
313 dccf = _mm_setzero_ps();
317 ratio = _mm_min_ps(ratio, still_p5inv);
318 theta = _mm_mul_ps(ratio, still_pip5);
319 gmx_mm_sincos_ps(theta, &sinq, &cosq);
320 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
321 ccf = _mm_mul_ps(term, term);
322 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
323 _mm_mul_ps(sinq, theta));
326 prod = _mm_mul_ps(still_p4, vaj);
327 icf4 = _mm_mul_ps(ccf, rinv4);
328 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
330 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
332 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
334 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
336 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
346 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
347 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
348 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
351 else if (offset == 2)
357 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
358 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
359 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
364 /* offset must be 3 */
371 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
372 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
373 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
377 dx = _mm_sub_ps(ix, jx);
378 dy = _mm_sub_ps(iy, jy);
379 dz = _mm_sub_ps(iz, jz);
381 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
382 rinv = gmx_mm_invsqrt_ps(rsq);
383 rinv2 = _mm_mul_ps(rinv, rinv);
384 rinv4 = _mm_mul_ps(rinv2, rinv2);
385 rinv6 = _mm_mul_ps(rinv4, rinv2);
387 rvdw = _mm_add_ps(rai, raj);
388 ratio = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
390 mask_cmp = _mm_cmple_ps(ratio, still_p5inv);
392 if (0 == _mm_movemask_ps(mask_cmp))
394 /* if ratio>still_p5inv for ALL elements */
396 dccf = _mm_setzero_ps();
400 ratio = _mm_min_ps(ratio, still_p5inv);
401 theta = _mm_mul_ps(ratio, still_pip5);
402 gmx_mm_sincos_ps(theta, &sinq, &cosq);
403 term = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
404 ccf = _mm_mul_ps(term, term);
405 dccf = _mm_mul_ps(_mm_mul_ps(two, term),
406 _mm_mul_ps(sinq, theta));
409 prod = _mm_mul_ps(still_p4, vaj);
410 icf4 = _mm_mul_ps(ccf, rinv4);
411 icf6 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
413 gpi = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
415 _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
417 _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
420 tmp = _mm_mul_ps(prod_ai, icf4);
424 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
426 else if (offset == 2)
428 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
432 /* offset must be 3 */
433 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
436 GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
439 /* Sum up the polarization energy from other nodes */
440 if (DOMAINDECOMP(cr))
442 dd_atom_sum_real(cr->dd, work);
445 /* Compute the radii */
446 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
448 if (born->use[i] != 0)
450 gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
451 gpi2 = gpi_ai * gpi_ai;
452 born->bRad[i] = factor*gmx_invsqrt(gpi2);
453 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
457 /* Extra (local) communication required for DD */
458 if (DOMAINDECOMP(cr))
460 dd_atom_spread_real(cr->dd, born->bRad);
461 dd_atom_spread_real(cr->dd, fr->invsqrta);
469 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
470 float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
472 int i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
473 int jnrA, jnrB, jnrC, jnrD;
474 int j3A, j3B, j3C, j3D;
475 int jnrE, jnrF, jnrG, jnrH;
476 int j3E, j3F, j3G, j3H;
478 float rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
479 float sum_ai2, sum_ai3, tsum, tchain, doffset;
488 __m128 ix, iy, iz, jx, jy, jz;
489 __m128 dx, dy, dz, t1, t2, t3, t4;
491 __m128 rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
492 __m128 uij, lij2, uij2, lij3, uij3, diff2;
493 __m128 lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
494 __m128 sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
498 __m128 obc_mask1, obc_mask2, obc_mask3;
499 __m128 jxB, jyB, jzB, t1B, t2B, t3B, t4B;
500 __m128 dxB, dyB, dzB, rsqB, rinvB, rB;
501 __m128 rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
502 __m128 uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
503 __m128 lij_invB, sk2_invB, prodB;
504 __m128 sk_ajB, sk2_ajB, sk2_rinvB;
505 __m128 dadx1B, dadx2B;
507 __m128 obc_mask1B, obc_mask2B, obc_mask3B;
509 __m128 mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
510 __m128 mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
511 __m128 mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
513 __m128 oneeighth = _mm_set1_ps(0.125);
514 __m128 onefourth = _mm_set1_ps(0.25);
516 const __m128 half = _mm_set1_ps(0.5f);
517 const __m128 three = _mm_set1_ps(3.0f);
518 const __m128 one = _mm_set1_ps(1.0f);
519 const __m128 two = _mm_set1_ps(2.0f);
520 const __m128 zero = _mm_set1_ps(0.0f);
521 const __m128 neg = _mm_set1_ps(-1.0f);
523 /* Set the dielectric offset */
524 doffset = born->gb_doffset;
525 gb_radius = born->gb_radius;
526 obc_param = born->param;
527 work = born->gpol_hct_work;
530 shiftvec = fr->shift_vec[0];
532 jx = _mm_setzero_ps();
533 jy = _mm_setzero_ps();
534 jz = _mm_setzero_ps();
536 jnrA = jnrB = jnrC = jnrD = 0;
538 for (i = 0; i < born->nr; i++)
543 for (i = 0; i < nl->nri; i++)
547 is3 = 3*nl->shift[i];
549 shY = shiftvec[is3+1];
550 shZ = shiftvec[is3+2];
552 nj1 = nl->jindex[i+1];
554 ix = _mm_set1_ps(shX+x[ii3+0]);
555 iy = _mm_set1_ps(shY+x[ii3+1]);
556 iz = _mm_set1_ps(shZ+x[ii3+2]);
558 offset = (nj1-nj0)%4;
560 rai = _mm_load1_ps(gb_radius+ii);
561 rai_inv = gmx_mm_inv_ps(rai);
563 sum_ai = _mm_setzero_ps();
565 sk_ai = _mm_load1_ps(born->param+ii);
566 sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
568 for (k = nj0; k < nj1-4-offset; k += 8)
588 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
589 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
590 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
591 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
592 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
593 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
595 dx = _mm_sub_ps(ix, jx);
596 dy = _mm_sub_ps(iy, jy);
597 dz = _mm_sub_ps(iz, jz);
598 dxB = _mm_sub_ps(ix, jxB);
599 dyB = _mm_sub_ps(iy, jyB);
600 dzB = _mm_sub_ps(iz, jzB);
602 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
603 rsqB = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
605 rinv = gmx_mm_invsqrt_ps(rsq);
606 r = _mm_mul_ps(rsq, rinv);
607 rinvB = gmx_mm_invsqrt_ps(rsqB);
608 rB = _mm_mul_ps(rsqB, rinvB);
610 /* Compute raj_inv aj1-4 */
611 raj_inv = gmx_mm_inv_ps(raj);
612 raj_invB = gmx_mm_inv_ps(rajB);
614 /* Evaluate influence of atom aj -> ai */
615 t1 = _mm_add_ps(r, sk_aj);
616 t2 = _mm_sub_ps(r, sk_aj);
617 t3 = _mm_sub_ps(sk_aj, r);
618 t1B = _mm_add_ps(rB, sk_ajB);
619 t2B = _mm_sub_ps(rB, sk_ajB);
620 t3B = _mm_sub_ps(sk_ajB, rB);
621 obc_mask1 = _mm_cmplt_ps(rai, t1);
622 obc_mask2 = _mm_cmplt_ps(rai, t2);
623 obc_mask3 = _mm_cmplt_ps(rai, t3);
624 obc_mask1B = _mm_cmplt_ps(rai, t1B);
625 obc_mask2B = _mm_cmplt_ps(rai, t2B);
626 obc_mask3B = _mm_cmplt_ps(rai, t3B);
628 uij = gmx_mm_inv_ps(t1);
629 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
630 _mm_andnot_ps(obc_mask2, rai_inv));
631 dlij = _mm_and_ps(one, obc_mask2);
632 uij2 = _mm_mul_ps(uij, uij);
633 uij3 = _mm_mul_ps(uij2, uij);
634 lij2 = _mm_mul_ps(lij, lij);
635 lij3 = _mm_mul_ps(lij2, lij);
637 uijB = gmx_mm_inv_ps(t1B);
638 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
639 _mm_andnot_ps(obc_mask2B, rai_inv));
640 dlijB = _mm_and_ps(one, obc_mask2B);
641 uij2B = _mm_mul_ps(uijB, uijB);
642 uij3B = _mm_mul_ps(uij2B, uijB);
643 lij2B = _mm_mul_ps(lijB, lijB);
644 lij3B = _mm_mul_ps(lij2B, lijB);
646 diff2 = _mm_sub_ps(uij2, lij2);
647 lij_inv = gmx_mm_invsqrt_ps(lij2);
648 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
649 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
650 prod = _mm_mul_ps(onefourth, sk2_rinv);
652 diff2B = _mm_sub_ps(uij2B, lij2B);
653 lij_invB = gmx_mm_invsqrt_ps(lij2B);
654 sk2_ajB = _mm_mul_ps(sk_ajB, sk_ajB);
655 sk2_rinvB = _mm_mul_ps(sk2_ajB, rinvB);
656 prodB = _mm_mul_ps(onefourth, sk2_rinvB);
658 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
659 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
661 t1 = _mm_sub_ps(lij, uij);
662 t2 = _mm_mul_ps(diff2,
663 _mm_sub_ps(_mm_mul_ps(onefourth, r),
665 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
666 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
667 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
668 t4 = _mm_and_ps(t4, obc_mask3);
669 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
671 t1B = _mm_sub_ps(lijB, uijB);
672 t2B = _mm_mul_ps(diff2B,
673 _mm_sub_ps(_mm_mul_ps(onefourth, rB),
675 t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
676 t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
677 t4B = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
678 t4B = _mm_and_ps(t4B, obc_mask3B);
679 t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
681 sum_ai = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
683 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
684 _mm_mul_ps(prod, lij3));
686 _mm_mul_ps(onefourth,
687 _mm_add_ps(_mm_mul_ps(lij, rinv),
688 _mm_mul_ps(lij3, r))));
689 t2 = _mm_mul_ps(onefourth,
690 _mm_add_ps(_mm_mul_ps(uij, rinv),
691 _mm_mul_ps(uij3, r)));
693 _mm_add_ps(_mm_mul_ps(half, uij2),
694 _mm_mul_ps(prod, uij3)));
695 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
696 _mm_mul_ps(rinv, rinv));
698 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
700 _mm_mul_ps(sk2_rinv, rinv))));
701 t1 = _mm_mul_ps(rinv,
702 _mm_add_ps(_mm_mul_ps(dlij, t1),
703 _mm_add_ps(t2, t3)));
707 t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
708 _mm_mul_ps(prodB, lij3B));
709 t1B = _mm_sub_ps(t1B,
710 _mm_mul_ps(onefourth,
711 _mm_add_ps(_mm_mul_ps(lijB, rinvB),
712 _mm_mul_ps(lij3B, rB))));
713 t2B = _mm_mul_ps(onefourth,
714 _mm_add_ps(_mm_mul_ps(uijB, rinvB),
715 _mm_mul_ps(uij3B, rB)));
716 t2B = _mm_sub_ps(t2B,
717 _mm_add_ps(_mm_mul_ps(half, uij2B),
718 _mm_mul_ps(prodB, uij3B)));
719 t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
720 _mm_mul_ps(rinvB, rinvB));
721 t3B = _mm_sub_ps(t3B,
722 _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
724 _mm_mul_ps(sk2_rinvB, rinvB))));
725 t1B = _mm_mul_ps(rinvB,
726 _mm_add_ps(_mm_mul_ps(dlijB, t1B),
727 _mm_add_ps(t2B, t3B)));
729 dadx1 = _mm_and_ps(t1, obc_mask1);
730 dadx1B = _mm_and_ps(t1B, obc_mask1B);
733 /* Evaluate influence of atom ai -> aj */
734 t1 = _mm_add_ps(r, sk_ai);
735 t2 = _mm_sub_ps(r, sk_ai);
736 t3 = _mm_sub_ps(sk_ai, r);
737 t1B = _mm_add_ps(rB, sk_ai);
738 t2B = _mm_sub_ps(rB, sk_ai);
739 t3B = _mm_sub_ps(sk_ai, rB);
740 obc_mask1 = _mm_cmplt_ps(raj, t1);
741 obc_mask2 = _mm_cmplt_ps(raj, t2);
742 obc_mask3 = _mm_cmplt_ps(raj, t3);
743 obc_mask1B = _mm_cmplt_ps(rajB, t1B);
744 obc_mask2B = _mm_cmplt_ps(rajB, t2B);
745 obc_mask3B = _mm_cmplt_ps(rajB, t3B);
747 uij = gmx_mm_inv_ps(t1);
748 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
749 _mm_andnot_ps(obc_mask2, raj_inv));
750 dlij = _mm_and_ps(one, obc_mask2);
751 uij2 = _mm_mul_ps(uij, uij);
752 uij3 = _mm_mul_ps(uij2, uij);
753 lij2 = _mm_mul_ps(lij, lij);
754 lij3 = _mm_mul_ps(lij2, lij);
756 uijB = gmx_mm_inv_ps(t1B);
757 lijB = _mm_or_ps( _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
758 _mm_andnot_ps(obc_mask2B, raj_invB));
759 dlijB = _mm_and_ps(one, obc_mask2B);
760 uij2B = _mm_mul_ps(uijB, uijB);
761 uij3B = _mm_mul_ps(uij2B, uijB);
762 lij2B = _mm_mul_ps(lijB, lijB);
763 lij3B = _mm_mul_ps(lij2B, lijB);
765 diff2 = _mm_sub_ps(uij2, lij2);
766 lij_inv = gmx_mm_invsqrt_ps(lij2);
767 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
768 prod = _mm_mul_ps(onefourth, sk2_rinv);
770 diff2B = _mm_sub_ps(uij2B, lij2B);
771 lij_invB = gmx_mm_invsqrt_ps(lij2B);
772 sk2_rinvB = _mm_mul_ps(sk2_ai, rinvB);
773 prodB = _mm_mul_ps(onefourth, sk2_rinvB);
775 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
776 logtermB = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
778 t1 = _mm_sub_ps(lij, uij);
779 t2 = _mm_mul_ps(diff2,
780 _mm_sub_ps(_mm_mul_ps(onefourth, r),
782 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
783 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
784 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
785 t4 = _mm_and_ps(t4, obc_mask3);
786 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
788 t1B = _mm_sub_ps(lijB, uijB);
789 t2B = _mm_mul_ps(diff2B,
790 _mm_sub_ps(_mm_mul_ps(onefourth, rB),
792 t3B = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
793 t1B = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
794 t4B = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
795 t4B = _mm_and_ps(t4B, obc_mask3B);
796 t1B = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
798 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
799 GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
801 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
802 _mm_mul_ps(prod, lij3));
804 _mm_mul_ps(onefourth,
805 _mm_add_ps(_mm_mul_ps(lij, rinv),
806 _mm_mul_ps(lij3, r))));
807 t2 = _mm_mul_ps(onefourth,
808 _mm_add_ps(_mm_mul_ps(uij, rinv),
809 _mm_mul_ps(uij3, r)));
811 _mm_add_ps(_mm_mul_ps(half, uij2),
812 _mm_mul_ps(prod, uij3)));
813 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
814 _mm_mul_ps(rinv, rinv));
816 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
818 _mm_mul_ps(sk2_rinv, rinv))));
819 t1 = _mm_mul_ps(rinv,
820 _mm_add_ps(_mm_mul_ps(dlij, t1),
821 _mm_add_ps(t2, t3)));
824 t1B = _mm_add_ps(_mm_mul_ps(half, lij2B),
825 _mm_mul_ps(prodB, lij3B));
826 t1B = _mm_sub_ps(t1B,
827 _mm_mul_ps(onefourth,
828 _mm_add_ps(_mm_mul_ps(lijB, rinvB),
829 _mm_mul_ps(lij3B, rB))));
830 t2B = _mm_mul_ps(onefourth,
831 _mm_add_ps(_mm_mul_ps(uijB, rinvB),
832 _mm_mul_ps(uij3B, rB)));
833 t2B = _mm_sub_ps(t2B,
834 _mm_add_ps(_mm_mul_ps(half, uij2B),
835 _mm_mul_ps(prodB, uij3B)));
836 t3B = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
837 _mm_mul_ps(rinvB, rinvB));
838 t3B = _mm_sub_ps(t3B,
839 _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
841 _mm_mul_ps(sk2_rinvB, rinvB))));
842 t1B = _mm_mul_ps(rinvB,
843 _mm_add_ps(_mm_mul_ps(dlijB, t1B),
844 _mm_add_ps(t2B, t3B)));
847 dadx2 = _mm_and_ps(t1, obc_mask1);
848 dadx2B = _mm_and_ps(t1B, obc_mask1B);
850 _mm_store_ps(dadx, dadx1);
852 _mm_store_ps(dadx, dadx2);
854 _mm_store_ps(dadx, dadx1B);
856 _mm_store_ps(dadx, dadx2B);
859 } /* end normal inner loop */
861 for (; k < nj1-offset; k += 4)
873 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
874 GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
875 GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
877 dx = _mm_sub_ps(ix, jx);
878 dy = _mm_sub_ps(iy, jy);
879 dz = _mm_sub_ps(iz, jz);
881 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
883 rinv = gmx_mm_invsqrt_ps(rsq);
884 r = _mm_mul_ps(rsq, rinv);
886 /* Compute raj_inv aj1-4 */
887 raj_inv = gmx_mm_inv_ps(raj);
889 /* Evaluate influence of atom aj -> ai */
890 t1 = _mm_add_ps(r, sk_aj);
891 obc_mask1 = _mm_cmplt_ps(rai, t1);
893 if (_mm_movemask_ps(obc_mask1))
895 /* If any of the elements has rai<dr+sk, this is executed */
896 t2 = _mm_sub_ps(r, sk_aj);
897 t3 = _mm_sub_ps(sk_aj, r);
899 obc_mask2 = _mm_cmplt_ps(rai, t2);
900 obc_mask3 = _mm_cmplt_ps(rai, t3);
902 uij = gmx_mm_inv_ps(t1);
903 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
904 _mm_andnot_ps(obc_mask2, rai_inv));
905 dlij = _mm_and_ps(one, obc_mask2);
906 uij2 = _mm_mul_ps(uij, uij);
907 uij3 = _mm_mul_ps(uij2, uij);
908 lij2 = _mm_mul_ps(lij, lij);
909 lij3 = _mm_mul_ps(lij2, lij);
910 diff2 = _mm_sub_ps(uij2, lij2);
911 lij_inv = gmx_mm_invsqrt_ps(lij2);
912 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
913 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
914 prod = _mm_mul_ps(onefourth, sk2_rinv);
915 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
916 t1 = _mm_sub_ps(lij, uij);
917 t2 = _mm_mul_ps(diff2,
918 _mm_sub_ps(_mm_mul_ps(onefourth, r),
920 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
921 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
922 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
923 t4 = _mm_and_ps(t4, obc_mask3);
924 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
925 sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
926 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
927 _mm_mul_ps(prod, lij3));
929 _mm_mul_ps(onefourth,
930 _mm_add_ps(_mm_mul_ps(lij, rinv),
931 _mm_mul_ps(lij3, r))));
932 t2 = _mm_mul_ps(onefourth,
933 _mm_add_ps(_mm_mul_ps(uij, rinv),
934 _mm_mul_ps(uij3, r)));
936 _mm_add_ps(_mm_mul_ps(half, uij2),
937 _mm_mul_ps(prod, uij3)));
938 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
939 _mm_mul_ps(rinv, rinv));
941 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
943 _mm_mul_ps(sk2_rinv, rinv))));
944 t1 = _mm_mul_ps(rinv,
945 _mm_add_ps(_mm_mul_ps(dlij, t1),
946 _mm_add_ps(t2, t3)));
948 dadx1 = _mm_and_ps(t1, obc_mask1);
952 dadx1 = _mm_setzero_ps();
955 /* Evaluate influence of atom ai -> aj */
956 t1 = _mm_add_ps(r, sk_ai);
957 obc_mask1 = _mm_cmplt_ps(raj, t1);
959 if (_mm_movemask_ps(obc_mask1))
961 t2 = _mm_sub_ps(r, sk_ai);
962 t3 = _mm_sub_ps(sk_ai, r);
963 obc_mask2 = _mm_cmplt_ps(raj, t2);
964 obc_mask3 = _mm_cmplt_ps(raj, t3);
966 uij = gmx_mm_inv_ps(t1);
967 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
968 _mm_andnot_ps(obc_mask2, raj_inv));
969 dlij = _mm_and_ps(one, obc_mask2);
970 uij2 = _mm_mul_ps(uij, uij);
971 uij3 = _mm_mul_ps(uij2, uij);
972 lij2 = _mm_mul_ps(lij, lij);
973 lij3 = _mm_mul_ps(lij2, lij);
974 diff2 = _mm_sub_ps(uij2, lij2);
975 lij_inv = gmx_mm_invsqrt_ps(lij2);
976 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
977 prod = _mm_mul_ps(onefourth, sk2_rinv);
978 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
979 t1 = _mm_sub_ps(lij, uij);
980 t2 = _mm_mul_ps(diff2,
981 _mm_sub_ps(_mm_mul_ps(onefourth, r),
983 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
984 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
985 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
986 t4 = _mm_and_ps(t4, obc_mask3);
987 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
989 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
991 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
992 _mm_mul_ps(prod, lij3));
994 _mm_mul_ps(onefourth,
995 _mm_add_ps(_mm_mul_ps(lij, rinv),
996 _mm_mul_ps(lij3, r))));
997 t2 = _mm_mul_ps(onefourth,
998 _mm_add_ps(_mm_mul_ps(uij, rinv),
999 _mm_mul_ps(uij3, r)));
1001 _mm_add_ps(_mm_mul_ps(half, uij2),
1002 _mm_mul_ps(prod, uij3)));
1003 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1004 _mm_mul_ps(rinv, rinv));
1006 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1008 _mm_mul_ps(sk2_rinv, rinv))));
1009 t1 = _mm_mul_ps(rinv,
1010 _mm_add_ps(_mm_mul_ps(dlij, t1),
1011 _mm_add_ps(t2, t3)));
1012 dadx2 = _mm_and_ps(t1, obc_mask1);
1016 dadx2 = _mm_setzero_ps();
1019 _mm_store_ps(dadx, dadx1);
1021 _mm_store_ps(dadx, dadx2);
1023 } /* end normal inner loop */
1031 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1032 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1033 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1036 else if (offset == 2)
1042 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1043 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1044 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1049 /* offset must be 3 */
1056 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1057 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1058 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1062 dx = _mm_sub_ps(ix, jx);
1063 dy = _mm_sub_ps(iy, jy);
1064 dz = _mm_sub_ps(iz, jz);
1066 rsq = gmx_mm_calc_rsq_ps(dx, dy, dz);
1068 rinv = gmx_mm_invsqrt_ps(rsq);
1069 r = _mm_mul_ps(rsq, rinv);
1071 /* Compute raj_inv aj1-4 */
1072 raj_inv = gmx_mm_inv_ps(raj);
1074 /* Evaluate influence of atom aj -> ai */
1075 t1 = _mm_add_ps(r, sk_aj);
1076 obc_mask1 = _mm_cmplt_ps(rai, t1);
1077 obc_mask1 = _mm_and_ps(obc_mask1, mask);
1079 if (_mm_movemask_ps(obc_mask1))
1081 t2 = _mm_sub_ps(r, sk_aj);
1082 t3 = _mm_sub_ps(sk_aj, r);
1083 obc_mask2 = _mm_cmplt_ps(rai, t2);
1084 obc_mask3 = _mm_cmplt_ps(rai, t3);
1086 uij = gmx_mm_inv_ps(t1);
1087 lij = _mm_or_ps( _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1088 _mm_andnot_ps(obc_mask2, rai_inv));
1089 dlij = _mm_and_ps(one, obc_mask2);
1090 uij2 = _mm_mul_ps(uij, uij);
1091 uij3 = _mm_mul_ps(uij2, uij);
1092 lij2 = _mm_mul_ps(lij, lij);
1093 lij3 = _mm_mul_ps(lij2, lij);
1094 diff2 = _mm_sub_ps(uij2, lij2);
1095 lij_inv = gmx_mm_invsqrt_ps(lij2);
1096 sk2_aj = _mm_mul_ps(sk_aj, sk_aj);
1097 sk2_rinv = _mm_mul_ps(sk2_aj, rinv);
1098 prod = _mm_mul_ps(onefourth, sk2_rinv);
1099 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1100 t1 = _mm_sub_ps(lij, uij);
1101 t2 = _mm_mul_ps(diff2,
1102 _mm_sub_ps(_mm_mul_ps(onefourth, r),
1104 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1105 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1106 t4 = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1107 t4 = _mm_and_ps(t4, obc_mask3);
1108 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1109 sum_ai = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1110 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
1111 _mm_mul_ps(prod, lij3));
1113 _mm_mul_ps(onefourth,
1114 _mm_add_ps(_mm_mul_ps(lij, rinv),
1115 _mm_mul_ps(lij3, r))));
1116 t2 = _mm_mul_ps(onefourth,
1117 _mm_add_ps(_mm_mul_ps(uij, rinv),
1118 _mm_mul_ps(uij3, r)));
1120 _mm_add_ps(_mm_mul_ps(half, uij2),
1121 _mm_mul_ps(prod, uij3)));
1122 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1123 _mm_mul_ps(rinv, rinv));
1125 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1127 _mm_mul_ps(sk2_rinv, rinv))));
1128 t1 = _mm_mul_ps(rinv,
1129 _mm_add_ps(_mm_mul_ps(dlij, t1),
1130 _mm_add_ps(t2, t3)));
1131 dadx1 = _mm_and_ps(t1, obc_mask1);
1135 dadx1 = _mm_setzero_ps();
1138 /* Evaluate influence of atom ai -> aj */
1139 t1 = _mm_add_ps(r, sk_ai);
1140 obc_mask1 = _mm_cmplt_ps(raj, t1);
1141 obc_mask1 = _mm_and_ps(obc_mask1, mask);
1143 if (_mm_movemask_ps(obc_mask1))
1145 t2 = _mm_sub_ps(r, sk_ai);
1146 t3 = _mm_sub_ps(sk_ai, r);
1147 obc_mask2 = _mm_cmplt_ps(raj, t2);
1148 obc_mask3 = _mm_cmplt_ps(raj, t3);
1150 uij = gmx_mm_inv_ps(t1);
1151 lij = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1152 _mm_andnot_ps(obc_mask2, raj_inv));
1153 dlij = _mm_and_ps(one, obc_mask2);
1154 uij2 = _mm_mul_ps(uij, uij);
1155 uij3 = _mm_mul_ps(uij2, uij);
1156 lij2 = _mm_mul_ps(lij, lij);
1157 lij3 = _mm_mul_ps(lij2, lij);
1158 diff2 = _mm_sub_ps(uij2, lij2);
1159 lij_inv = gmx_mm_invsqrt_ps(lij2);
1160 sk2_rinv = _mm_mul_ps(sk2_ai, rinv);
1161 prod = _mm_mul_ps(onefourth, sk2_rinv);
1162 logterm = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1163 t1 = _mm_sub_ps(lij, uij);
1164 t2 = _mm_mul_ps(diff2,
1165 _mm_sub_ps(_mm_mul_ps(onefourth, r),
1167 t3 = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1168 t1 = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1169 t4 = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
1170 t4 = _mm_and_ps(t4, obc_mask3);
1171 t1 = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1173 tmp = _mm_and_ps(t1, obc_mask1);
1175 t1 = _mm_add_ps(_mm_mul_ps(half, lij2),
1176 _mm_mul_ps(prod, lij3));
1178 _mm_mul_ps(onefourth,
1179 _mm_add_ps(_mm_mul_ps(lij, rinv),
1180 _mm_mul_ps(lij3, r))));
1181 t2 = _mm_mul_ps(onefourth,
1182 _mm_add_ps(_mm_mul_ps(uij, rinv),
1183 _mm_mul_ps(uij3, r)));
1185 _mm_add_ps(_mm_mul_ps(half, uij2),
1186 _mm_mul_ps(prod, uij3)));
1187 t3 = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1188 _mm_mul_ps(rinv, rinv));
1190 _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1192 _mm_mul_ps(sk2_rinv, rinv))));
1193 t1 = _mm_mul_ps(rinv,
1194 _mm_add_ps(_mm_mul_ps(dlij, t1),
1195 _mm_add_ps(t2, t3)));
1196 dadx2 = _mm_and_ps(t1, obc_mask1);
1200 dadx2 = _mm_setzero_ps();
1201 tmp = _mm_setzero_ps();
1204 _mm_store_ps(dadx, dadx1);
1206 _mm_store_ps(dadx, dadx2);
1211 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
1213 else if (offset == 2)
1215 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
1219 /* offset must be 3 */
1220 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
1224 GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
1228 /* Parallel summations */
1229 if (DOMAINDECOMP(cr))
1231 dd_atom_sum_real(cr->dd, work);
1234 if (gb_algorithm == egbHCT)
1237 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1239 if (born->use[i] != 0)
1241 rr = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1242 sum = 1.0/rr - work[i];
1243 min_rad = rr + doffset;
1246 born->bRad[i] = rad > min_rad ? rad : min_rad;
1247 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1251 /* Extra communication required for DD */
1252 if (DOMAINDECOMP(cr))
1254 dd_atom_spread_real(cr->dd, born->bRad);
1255 dd_atom_spread_real(cr->dd, fr->invsqrta);
1261 for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1263 if (born->use[i] != 0)
1265 rr = top->atomtypes.gb_radius[md->typeA[i]];
1273 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1274 born->bRad[i] = rr_inv - tsum*rr_inv2;
1275 born->bRad[i] = 1.0 / born->bRad[i];
1277 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1279 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1280 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1283 /* Extra (local) communication required for DD */
1284 if (DOMAINDECOMP(cr))
1286 dd_atom_spread_real(cr->dd, born->bRad);
1287 dd_atom_spread_real(cr->dd, fr->invsqrta);
1288 dd_atom_spread_real(cr->dd, born->drobc);
1299 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1300 float *x, float *f, float *fshift, float *shiftvec,
1301 int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1303 int i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
1304 int jnrA, jnrB, jnrC, jnrD;
1305 int j3A, j3B, j3C, j3D;
1306 int jnrE, jnrF, jnrG, jnrH;
1307 int j3E, j3F, j3G, j3H;
1310 float rbi, shX, shY, shZ;
1315 __m128 jxB, jyB, jzB;
1316 __m128 fix, fiy, fiz;
1319 __m128 dxB, dyB, dzB;
1320 __m128 txB, tyB, tzB;
1322 __m128 rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
1323 __m128 xmm1, xmm2, xmm3;
1325 const __m128 two = _mm_set1_ps(2.0f);
1331 /* Loop to get the proper form for the Born radius term, sse style */
1337 if (gb_algorithm == egbSTILL)
1339 for (i = n0; i < n1; i++)
1341 rbi = born->bRad[i];
1342 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1345 else if (gb_algorithm == egbHCT)
1347 for (i = n0; i < n1; i++)
1349 rbi = born->bRad[i];
1350 rb[i] = rbi * rbi * dvda[i];
1353 else if (gb_algorithm == egbOBC)
1355 for (i = n0; i < n1; i++)
1357 rbi = born->bRad[i];
1358 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1362 jz = _mm_setzero_ps();
1364 n = j3A = j3B = j3C = j3D = 0;
1366 for (i = 0; i < nl->nri; i++)
1370 is3 = 3*nl->shift[i];
1371 shX = shiftvec[is3];
1372 shY = shiftvec[is3+1];
1373 shZ = shiftvec[is3+2];
1374 nj0 = nl->jindex[i];
1375 nj1 = nl->jindex[i+1];
1377 ix = _mm_set1_ps(shX+x[ii3+0]);
1378 iy = _mm_set1_ps(shY+x[ii3+1]);
1379 iz = _mm_set1_ps(shZ+x[ii3+2]);
1381 offset = (nj1-nj0)%4;
1383 rbai = _mm_load1_ps(rb+ii);
1384 fix = _mm_setzero_ps();
1385 fiy = _mm_setzero_ps();
1386 fiz = _mm_setzero_ps();
1389 for (k = nj0; k < nj1-offset; k += 4)
1401 GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
1403 dx = _mm_sub_ps(ix, jx);
1404 dy = _mm_sub_ps(iy, jy);
1405 dz = _mm_sub_ps(iz, jz);
1407 GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
1409 /* load chain rule terms for j1-4 */
1410 f_gb = _mm_load_ps(dadx);
1412 f_gb_ai = _mm_load_ps(dadx);
1415 /* calculate scalar force */
1416 f_gb = _mm_mul_ps(f_gb, rbai);
1417 f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1418 f_gb = _mm_add_ps(f_gb, f_gb_ai);
1420 tx = _mm_mul_ps(f_gb, dx);
1421 ty = _mm_mul_ps(f_gb, dy);
1422 tz = _mm_mul_ps(f_gb, dz);
1424 fix = _mm_add_ps(fix, tx);
1425 fiy = _mm_add_ps(fiy, ty);
1426 fiz = _mm_add_ps(fiz, tz);
1428 GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
1431 /*deal with odd elements */
1438 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1439 GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
1441 else if (offset == 2)
1447 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1448 GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
1452 /* offset must be 3 */
1459 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1460 GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
1463 dx = _mm_sub_ps(ix, jx);
1464 dy = _mm_sub_ps(iy, jy);
1465 dz = _mm_sub_ps(iz, jz);
1467 /* load chain rule terms for j1-4 */
1468 f_gb = _mm_load_ps(dadx);
1470 f_gb_ai = _mm_load_ps(dadx);
1473 /* calculate scalar force */
1474 f_gb = _mm_mul_ps(f_gb, rbai);
1475 f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1476 f_gb = _mm_add_ps(f_gb, f_gb_ai);
1478 tx = _mm_mul_ps(f_gb, dx);
1479 ty = _mm_mul_ps(f_gb, dy);
1480 tz = _mm_mul_ps(f_gb, dz);
1482 fix = _mm_add_ps(fix, tx);
1483 fiy = _mm_add_ps(fiy, ty);
1484 fiz = _mm_add_ps(fiz, tz);
1488 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
1490 else if (offset == 2)
1492 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
1496 /* offset must be 3 */
1497 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
1501 /* fix/fiy/fiz now contain four partial force terms, that all should be
1502 * added to the i particle forces and shift forces.
1504 gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
1512 /* keep compiler happy */
1513 int genborn_sse_dummy;
1515 #endif /* SSE intrinsics available */