f71b22075ae312d493a76b9c8ace8ad2e950074a
[alexxy/gromacs.git] / src / gromacs / mdlib / genborn_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2008, The GROMACS development team.
6  * Copyright (c) 2013,2014, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "config.h"
40
41 #include <math.h>
42 #include <string.h>
43
44 #include "gromacs/legacyheaders/typedefs.h"
45 #include "gromacs/utility/smalloc.h"
46 #include "gromacs/legacyheaders/genborn.h"
47 #include "gromacs/math/vec.h"
48 #include "gromacs/fileio/pdbio.h"
49 #include "gromacs/legacyheaders/names.h"
50 #include "gromacs/math/units.h"
51 #include "gromacs/legacyheaders/domdec.h"
52 #include "gromacs/legacyheaders/network.h"
53 #include "gromacs/utility/fatalerror.h"
54 #include "gromacs/legacyheaders/genborn.h"
55
56 #include "gromacs/utility/gmxmpi.h"
57
58
59 /* Only compile this file if SSE intrinsics are available */
60 #if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
61
62 #include <gmx_sse2_single.h>
63 #include <emmintrin.h>
64
65 #include "genborn_sse2_single.h"
66
67
68 int
69 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
70                               int natoms, gmx_localtop_t *top,
71                               float *x, t_nblist *nl,
72                               gmx_genborn_t *born)
73 {
74     int          i, k, n, ii, is3, ii3, nj0, nj1, offset;
75     int          jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
76     int          jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
77     int          shift;
78     int         *mdtype;
79     real         shX, shY, shZ;
80     int         *jjnr;
81     real        *shiftvec;
82
83     float        gpi_ai, gpi2;
84     float        factor;
85     float       *gb_radius;
86     float       *vsolv;
87     float       *work;
88     float       *dadx;
89
90     __m128       ix, iy, iz;
91     __m128       jx, jy, jz;
92     __m128       dx, dy, dz;
93     __m128       tx, ty, tz;
94     __m128       jxB, jyB, jzB;
95     __m128       dxB, dyB, dzB;
96     __m128       txB, tyB, tzB;
97     __m128       rsq, rinv, rinv2, rinv4, rinv6;
98     __m128       rsqB, rinvB, rinv2B, rinv4B, rinv6B;
99     __m128       ratio, gpi, rai, raj, vai, vaj, rvdw;
100     __m128       ratioB, rajB, vajB, rvdwB;
101     __m128       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
102     __m128       ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
103     __m128       mask, icf4, icf6, mask_cmp;
104     __m128       icf4B, icf6B, mask_cmpB;
105
106     __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
107     __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
108     __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
109
110     const __m128 half   = _mm_set1_ps(0.5f);
111     const __m128 three  = _mm_set1_ps(3.0f);
112     const __m128 one    = _mm_set1_ps(1.0f);
113     const __m128 two    = _mm_set1_ps(2.0f);
114     const __m128 zero   = _mm_set1_ps(0.0f);
115     const __m128 four   = _mm_set1_ps(4.0f);
116
117     const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
118     const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
119     const __m128 still_p4     = _mm_set1_ps(STILL_P4);
120
121     factor  = 0.5 * ONE_4PI_EPS0;
122
123     gb_radius = born->gb_radius;
124     vsolv     = born->vsolv;
125     work      = born->gpol_still_work;
126     jjnr      = nl->jjnr;
127     shiftvec  = fr->shift_vec[0];
128     dadx      = fr->dadx;
129
130     jnrA = jnrB = jnrC = jnrD = 0;
131     jx   = _mm_setzero_ps();
132     jy   = _mm_setzero_ps();
133     jz   = _mm_setzero_ps();
134
135     n = 0;
136
137     for (i = 0; i < natoms; i++)
138     {
139         work[i] = 0;
140     }
141
142     for (i = 0; i < nl->nri; i++)
143     {
144         ii     = nl->iinr[i];
145         ii3    = ii*3;
146         is3    = 3*nl->shift[i];
147         shX    = shiftvec[is3];
148         shY    = shiftvec[is3+1];
149         shZ    = shiftvec[is3+2];
150         nj0    = nl->jindex[i];
151         nj1    = nl->jindex[i+1];
152
153         ix     = _mm_set1_ps(shX+x[ii3+0]);
154         iy     = _mm_set1_ps(shY+x[ii3+1]);
155         iz     = _mm_set1_ps(shZ+x[ii3+2]);
156
157         offset = (nj1-nj0)%4;
158
159         /* Polarization energy for atom ai */
160         gpi    = _mm_setzero_ps();
161
162         rai     = _mm_load1_ps(gb_radius+ii);
163         prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
164
165         for (k = nj0; k < nj1-4-offset; k += 8)
166         {
167             jnrA        = jjnr[k];
168             jnrB        = jjnr[k+1];
169             jnrC        = jjnr[k+2];
170             jnrD        = jjnr[k+3];
171             jnrE        = jjnr[k+4];
172             jnrF        = jjnr[k+5];
173             jnrG        = jjnr[k+6];
174             jnrH        = jjnr[k+7];
175
176             j3A         = 3*jnrA;
177             j3B         = 3*jnrB;
178             j3C         = 3*jnrC;
179             j3D         = 3*jnrD;
180             j3E         = 3*jnrE;
181             j3F         = 3*jnrF;
182             j3G         = 3*jnrG;
183             j3H         = 3*jnrH;
184
185             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
186             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
187
188             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
189             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
190             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
191             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);
192
193             dx          = _mm_sub_ps(ix, jx);
194             dy          = _mm_sub_ps(iy, jy);
195             dz          = _mm_sub_ps(iz, jz);
196             dxB         = _mm_sub_ps(ix, jxB);
197             dyB         = _mm_sub_ps(iy, jyB);
198             dzB         = _mm_sub_ps(iz, jzB);
199
200             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
201             rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
202             rinv        = gmx_mm_invsqrt_ps(rsq);
203             rinvB       = gmx_mm_invsqrt_ps(rsqB);
204             rinv2       = _mm_mul_ps(rinv, rinv);
205             rinv2B      = _mm_mul_ps(rinvB, rinvB);
206             rinv4       = _mm_mul_ps(rinv2, rinv2);
207             rinv4B      = _mm_mul_ps(rinv2B, rinv2B);
208             rinv6       = _mm_mul_ps(rinv4, rinv2);
209             rinv6B      = _mm_mul_ps(rinv4B, rinv2B);
210
211             rvdw        = _mm_add_ps(rai, raj);
212             rvdwB       = _mm_add_ps(rai, rajB);
213             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
214             ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));
215
216             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
217             mask_cmpB   = _mm_cmple_ps(ratioB, still_p5inv);
218
219             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
220             if (0 == _mm_movemask_ps(mask_cmp) )
221             {
222                 /* if ratio>still_p5inv for ALL elements */
223                 ccf         = one;
224                 dccf        = _mm_setzero_ps();
225             }
226             else
227             {
228                 ratio       = _mm_min_ps(ratio, still_p5inv);
229                 theta       = _mm_mul_ps(ratio, still_pip5);
230                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
231                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
232                 ccf         = _mm_mul_ps(term, term);
233                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
234                                          _mm_mul_ps(sinq, theta));
235             }
236             if (0 == _mm_movemask_ps(mask_cmpB) )
237             {
238                 /* if ratio>still_p5inv for ALL elements */
239                 ccfB        = one;
240                 dccfB       = _mm_setzero_ps();
241             }
242             else
243             {
244                 ratioB      = _mm_min_ps(ratioB, still_p5inv);
245                 thetaB      = _mm_mul_ps(ratioB, still_pip5);
246                 gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
247                 termB       = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
248                 ccfB        = _mm_mul_ps(termB, termB);
249                 dccfB       = _mm_mul_ps(_mm_mul_ps(two, termB),
250                                          _mm_mul_ps(sinqB, thetaB));
251             }
252
253             prod        = _mm_mul_ps(still_p4, vaj);
254             prodB       = _mm_mul_ps(still_p4, vajB);
255             icf4        = _mm_mul_ps(ccf, rinv4);
256             icf4B       = _mm_mul_ps(ccfB, rinv4B);
257             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
258             icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);
259
260             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
261             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));
262
263             gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );
264
265             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
266             dadx += 4;
267             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
268             dadx += 4;
269             _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
270             dadx += 4;
271             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
272             dadx += 4;
273         }
274
275         for (; k < nj1-offset; k += 4)
276         {
277             jnrA        = jjnr[k];
278             jnrB        = jjnr[k+1];
279             jnrC        = jjnr[k+2];
280             jnrD        = jjnr[k+3];
281
282             j3A         = 3*jnrA;
283             j3B         = 3*jnrB;
284             j3C         = 3*jnrC;
285             j3D         = 3*jnrD;
286
287             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
288
289             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
290             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
291
292             dx          = _mm_sub_ps(ix, jx);
293             dy          = _mm_sub_ps(iy, jy);
294             dz          = _mm_sub_ps(iz, jz);
295
296             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
297             rinv        = gmx_mm_invsqrt_ps(rsq);
298             rinv2       = _mm_mul_ps(rinv, rinv);
299             rinv4       = _mm_mul_ps(rinv2, rinv2);
300             rinv6       = _mm_mul_ps(rinv4, rinv2);
301
302             rvdw        = _mm_add_ps(rai, raj);
303             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
304
305             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
306
307             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
308             if (0 == _mm_movemask_ps(mask_cmp))
309             {
310                 /* if ratio>still_p5inv for ALL elements */
311                 ccf         = one;
312                 dccf        = _mm_setzero_ps();
313             }
314             else
315             {
316                 ratio       = _mm_min_ps(ratio, still_p5inv);
317                 theta       = _mm_mul_ps(ratio, still_pip5);
318                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
319                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
320                 ccf         = _mm_mul_ps(term, term);
321                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
322                                          _mm_mul_ps(sinq, theta));
323             }
324
325             prod        = _mm_mul_ps(still_p4, vaj);
326             icf4        = _mm_mul_ps(ccf, rinv4);
327             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
328
329             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
330
331             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
332
333             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
334             dadx += 4;
335             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
336             dadx += 4;
337         }
338
339         if (offset != 0)
340         {
341             if (offset == 1)
342             {
343                 jnrA        = jjnr[k];
344                 j3A         = 3*jnrA;
345                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
346                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
347                 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
348                 mask        = mask1;
349             }
350             else if (offset == 2)
351             {
352                 jnrA        = jjnr[k];
353                 jnrB        = jjnr[k+1];
354                 j3A         = 3*jnrA;
355                 j3B         = 3*jnrB;
356                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
357                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
358                 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
359                 mask        = mask2;
360             }
361             else
362             {
363                 /* offset must be 3 */
364                 jnrA        = jjnr[k];
365                 jnrB        = jjnr[k+1];
366                 jnrC        = jjnr[k+2];
367                 j3A         = 3*jnrA;
368                 j3B         = 3*jnrB;
369                 j3C         = 3*jnrC;
370                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
371                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
372                 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
373                 mask        = mask3;
374             }
375
376             dx          = _mm_sub_ps(ix, jx);
377             dy          = _mm_sub_ps(iy, jy);
378             dz          = _mm_sub_ps(iz, jz);
379
380             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
381             rinv        = gmx_mm_invsqrt_ps(rsq);
382             rinv2       = _mm_mul_ps(rinv, rinv);
383             rinv4       = _mm_mul_ps(rinv2, rinv2);
384             rinv6       = _mm_mul_ps(rinv4, rinv2);
385
386             rvdw        = _mm_add_ps(rai, raj);
387             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
388
389             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
390
391             if (0 == _mm_movemask_ps(mask_cmp))
392             {
393                 /* if ratio>still_p5inv for ALL elements */
394                 ccf         = one;
395                 dccf        = _mm_setzero_ps();
396             }
397             else
398             {
399                 ratio       = _mm_min_ps(ratio, still_p5inv);
400                 theta       = _mm_mul_ps(ratio, still_pip5);
401                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
402                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
403                 ccf         = _mm_mul_ps(term, term);
404                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
405                                          _mm_mul_ps(sinq, theta));
406             }
407
408             prod        = _mm_mul_ps(still_p4, vaj);
409             icf4        = _mm_mul_ps(ccf, rinv4);
410             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
411
412             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
413
414             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
415             dadx += 4;
416             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
417             dadx += 4;
418
419             tmp = _mm_mul_ps(prod_ai, icf4);
420
421             if (offset == 1)
422             {
423                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
424             }
425             else if (offset == 2)
426             {
427                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
428             }
429             else
430             {
431                 /* offset must be 3 */
432                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
433             }
434         }
435         GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
436     }
437
438     /* Sum up the polarization energy from other nodes */
439     if (DOMAINDECOMP(cr))
440     {
441         dd_atom_sum_real(cr->dd, work);
442     }
443
444     /* Compute the radii */
445     for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
446     {
447         if (born->use[i] != 0)
448         {
449             gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
450             gpi2             = gpi_ai * gpi_ai;
451             born->bRad[i]    = factor*gmx_invsqrt(gpi2);
452             fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
453         }
454     }
455
456     /* Extra (local) communication required for DD */
457     if (DOMAINDECOMP(cr))
458     {
459         dd_atom_spread_real(cr->dd, born->bRad);
460         dd_atom_spread_real(cr->dd, fr->invsqrta);
461     }
462
463     return 0;
464 }
465
466
467 int
468 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
469                                 float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
470 {
471     int          i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
472     int          jnrA, jnrB, jnrC, jnrD;
473     int          j3A, j3B, j3C, j3D;
474     int          jnrE, jnrF, jnrG, jnrH;
475     int          j3E, j3F, j3G, j3H;
476     float        shX, shY, shZ;
477     float        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
478     float        sum_ai2, sum_ai3, tsum, tchain, doffset;
479     float       *obc_param;
480     float       *gb_radius;
481     float       *work;
482     int       *  jjnr;
483     float       *dadx;
484     float       *shiftvec;
485     float        min_rad, rad;
486
487     __m128       ix, iy, iz, jx, jy, jz;
488     __m128       dx, dy, dz, t1, t2, t3, t4;
489     __m128       rsq, rinv, r;
490     __m128       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
491     __m128       uij, lij2, uij2, lij3, uij3, diff2;
492     __m128       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
493     __m128       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
494     __m128       dadx1, dadx2;
495     __m128       logterm;
496     __m128       mask;
497     __m128       obc_mask1, obc_mask2, obc_mask3;
498     __m128       jxB, jyB, jzB, t1B, t2B, t3B, t4B;
499     __m128       dxB, dyB, dzB, rsqB, rinvB, rB;
500     __m128       rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
501     __m128       uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
502     __m128       lij_invB, sk2_invB, prodB;
503     __m128       sk_ajB, sk2_ajB, sk2_rinvB;
504     __m128       dadx1B, dadx2B;
505     __m128       logtermB;
506     __m128       obc_mask1B, obc_mask2B, obc_mask3B;
507
508     __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
509     __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
510     __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
511
512     __m128       oneeighth   = _mm_set1_ps(0.125);
513     __m128       onefourth   = _mm_set1_ps(0.25);
514
515     const __m128 half  = _mm_set1_ps(0.5f);
516     const __m128 three = _mm_set1_ps(3.0f);
517     const __m128 one   = _mm_set1_ps(1.0f);
518     const __m128 two   = _mm_set1_ps(2.0f);
519     const __m128 zero  = _mm_set1_ps(0.0f);
520     const __m128 neg   = _mm_set1_ps(-1.0f);
521
522     /* Set the dielectric offset */
523     doffset   = born->gb_doffset;
524     gb_radius = born->gb_radius;
525     obc_param = born->param;
526     work      = born->gpol_hct_work;
527     jjnr      = nl->jjnr;
528     dadx      = fr->dadx;
529     shiftvec  = fr->shift_vec[0];
530
531     jx        = _mm_setzero_ps();
532     jy        = _mm_setzero_ps();
533     jz        = _mm_setzero_ps();
534
535     jnrA = jnrB = jnrC = jnrD = 0;
536
537     for (i = 0; i < born->nr; i++)
538     {
539         work[i] = 0;
540     }
541
542     for (i = 0; i < nl->nri; i++)
543     {
544         ii     = nl->iinr[i];
545         ii3    = ii*3;
546         is3    = 3*nl->shift[i];
547         shX    = shiftvec[is3];
548         shY    = shiftvec[is3+1];
549         shZ    = shiftvec[is3+2];
550         nj0    = nl->jindex[i];
551         nj1    = nl->jindex[i+1];
552
553         ix     = _mm_set1_ps(shX+x[ii3+0]);
554         iy     = _mm_set1_ps(shY+x[ii3+1]);
555         iz     = _mm_set1_ps(shZ+x[ii3+2]);
556
557         offset = (nj1-nj0)%4;
558
559         rai     = _mm_load1_ps(gb_radius+ii);
560         rai_inv = gmx_mm_inv_ps(rai);
561
562         sum_ai = _mm_setzero_ps();
563
564         sk_ai  = _mm_load1_ps(born->param+ii);
565         sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
566
567         for (k = nj0; k < nj1-4-offset; k += 8)
568         {
569             jnrA        = jjnr[k];
570             jnrB        = jjnr[k+1];
571             jnrC        = jjnr[k+2];
572             jnrD        = jjnr[k+3];
573             jnrE        = jjnr[k+4];
574             jnrF        = jjnr[k+5];
575             jnrG        = jjnr[k+6];
576             jnrH        = jjnr[k+7];
577
578             j3A         = 3*jnrA;
579             j3B         = 3*jnrB;
580             j3C         = 3*jnrC;
581             j3D         = 3*jnrD;
582             j3E         = 3*jnrE;
583             j3F         = 3*jnrF;
584             j3G         = 3*jnrG;
585             j3H         = 3*jnrH;
586
587             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
588             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
589             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
590             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
591             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
592             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
593
594             dx    = _mm_sub_ps(ix, jx);
595             dy    = _mm_sub_ps(iy, jy);
596             dz    = _mm_sub_ps(iz, jz);
597             dxB   = _mm_sub_ps(ix, jxB);
598             dyB   = _mm_sub_ps(iy, jyB);
599             dzB   = _mm_sub_ps(iz, jzB);
600
601             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
602             rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
603
604             rinv        = gmx_mm_invsqrt_ps(rsq);
605             r           = _mm_mul_ps(rsq, rinv);
606             rinvB       = gmx_mm_invsqrt_ps(rsqB);
607             rB          = _mm_mul_ps(rsqB, rinvB);
608
609             /* Compute raj_inv aj1-4 */
610             raj_inv     = gmx_mm_inv_ps(raj);
611             raj_invB    = gmx_mm_inv_ps(rajB);
612
613             /* Evaluate influence of atom aj -> ai */
614             t1            = _mm_add_ps(r, sk_aj);
615             t2            = _mm_sub_ps(r, sk_aj);
616             t3            = _mm_sub_ps(sk_aj, r);
617             t1B           = _mm_add_ps(rB, sk_ajB);
618             t2B           = _mm_sub_ps(rB, sk_ajB);
619             t3B           = _mm_sub_ps(sk_ajB, rB);
620             obc_mask1     = _mm_cmplt_ps(rai, t1);
621             obc_mask2     = _mm_cmplt_ps(rai, t2);
622             obc_mask3     = _mm_cmplt_ps(rai, t3);
623             obc_mask1B    = _mm_cmplt_ps(rai, t1B);
624             obc_mask2B    = _mm_cmplt_ps(rai, t2B);
625             obc_mask3B    = _mm_cmplt_ps(rai, t3B);
626
627             uij           = gmx_mm_inv_ps(t1);
628             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
629                                          _mm_andnot_ps(obc_mask2, rai_inv));
630             dlij          = _mm_and_ps(one, obc_mask2);
631             uij2          = _mm_mul_ps(uij, uij);
632             uij3          = _mm_mul_ps(uij2, uij);
633             lij2          = _mm_mul_ps(lij, lij);
634             lij3          = _mm_mul_ps(lij2, lij);
635
636             uijB          = gmx_mm_inv_ps(t1B);
637             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
638                                          _mm_andnot_ps(obc_mask2B, rai_inv));
639             dlijB         = _mm_and_ps(one, obc_mask2B);
640             uij2B         = _mm_mul_ps(uijB, uijB);
641             uij3B         = _mm_mul_ps(uij2B, uijB);
642             lij2B         = _mm_mul_ps(lijB, lijB);
643             lij3B         = _mm_mul_ps(lij2B, lijB);
644
645             diff2         = _mm_sub_ps(uij2, lij2);
646             lij_inv       = gmx_mm_invsqrt_ps(lij2);
647             sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
648             sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
649             prod          = _mm_mul_ps(onefourth, sk2_rinv);
650
651             diff2B        = _mm_sub_ps(uij2B, lij2B);
652             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
653             sk2_ajB       = _mm_mul_ps(sk_ajB, sk_ajB);
654             sk2_rinvB     = _mm_mul_ps(sk2_ajB, rinvB);
655             prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
656
657             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
658             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
659
660             t1            = _mm_sub_ps(lij, uij);
661             t2            = _mm_mul_ps(diff2,
662                                        _mm_sub_ps(_mm_mul_ps(onefourth, r),
663                                                   prod));
664             t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
665             t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
666             t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
667             t4            = _mm_and_ps(t4, obc_mask3);
668             t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
669
670             t1B           = _mm_sub_ps(lijB, uijB);
671             t2B           = _mm_mul_ps(diff2B,
672                                        _mm_sub_ps(_mm_mul_ps(onefourth, rB),
673                                                   prodB));
674             t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
675             t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
676             t4B           = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
677             t4B           = _mm_and_ps(t4B, obc_mask3B);
678             t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
679
680             sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
681
682             t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
683                                        _mm_mul_ps(prod, lij3));
684             t1            = _mm_sub_ps(t1,
685                                        _mm_mul_ps(onefourth,
686                                                   _mm_add_ps(_mm_mul_ps(lij, rinv),
687                                                              _mm_mul_ps(lij3, r))));
688             t2            = _mm_mul_ps(onefourth,
689                                        _mm_add_ps(_mm_mul_ps(uij, rinv),
690                                                   _mm_mul_ps(uij3, r)));
691             t2            = _mm_sub_ps(t2,
692                                        _mm_add_ps(_mm_mul_ps(half, uij2),
693                                                   _mm_mul_ps(prod, uij3)));
694             t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
695                                        _mm_mul_ps(rinv, rinv));
696             t3            = _mm_sub_ps(t3,
697                                        _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
698                                                   _mm_add_ps(one,
699                                                              _mm_mul_ps(sk2_rinv, rinv))));
700             t1            = _mm_mul_ps(rinv,
701                                        _mm_add_ps(_mm_mul_ps(dlij, t1),
702                                                   _mm_add_ps(t2, t3)));
703
704
705
706             t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
707                                        _mm_mul_ps(prodB, lij3B));
708             t1B           = _mm_sub_ps(t1B,
709                                        _mm_mul_ps(onefourth,
710                                                   _mm_add_ps(_mm_mul_ps(lijB, rinvB),
711                                                              _mm_mul_ps(lij3B, rB))));
712             t2B           = _mm_mul_ps(onefourth,
713                                        _mm_add_ps(_mm_mul_ps(uijB, rinvB),
714                                                   _mm_mul_ps(uij3B, rB)));
715             t2B           = _mm_sub_ps(t2B,
716                                        _mm_add_ps(_mm_mul_ps(half, uij2B),
717                                                   _mm_mul_ps(prodB, uij3B)));
718             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
719                                        _mm_mul_ps(rinvB, rinvB));
720             t3B           = _mm_sub_ps(t3B,
721                                        _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
722                                                   _mm_add_ps(one,
723                                                              _mm_mul_ps(sk2_rinvB, rinvB))));
724             t1B           = _mm_mul_ps(rinvB,
725                                        _mm_add_ps(_mm_mul_ps(dlijB, t1B),
726                                                   _mm_add_ps(t2B, t3B)));
727
728             dadx1         = _mm_and_ps(t1, obc_mask1);
729             dadx1B        = _mm_and_ps(t1B, obc_mask1B);
730
731
732             /* Evaluate influence of atom ai -> aj */
733             t1            = _mm_add_ps(r, sk_ai);
734             t2            = _mm_sub_ps(r, sk_ai);
735             t3            = _mm_sub_ps(sk_ai, r);
736             t1B           = _mm_add_ps(rB, sk_ai);
737             t2B           = _mm_sub_ps(rB, sk_ai);
738             t3B           = _mm_sub_ps(sk_ai, rB);
739             obc_mask1     = _mm_cmplt_ps(raj, t1);
740             obc_mask2     = _mm_cmplt_ps(raj, t2);
741             obc_mask3     = _mm_cmplt_ps(raj, t3);
742             obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
743             obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
744             obc_mask3B    = _mm_cmplt_ps(rajB, t3B);
745
746             uij           = gmx_mm_inv_ps(t1);
747             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
748                                          _mm_andnot_ps(obc_mask2, raj_inv));
749             dlij          = _mm_and_ps(one, obc_mask2);
750             uij2          = _mm_mul_ps(uij, uij);
751             uij3          = _mm_mul_ps(uij2, uij);
752             lij2          = _mm_mul_ps(lij, lij);
753             lij3          = _mm_mul_ps(lij2, lij);
754
755             uijB          = gmx_mm_inv_ps(t1B);
756             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
757                                          _mm_andnot_ps(obc_mask2B, raj_invB));
758             dlijB         = _mm_and_ps(one, obc_mask2B);
759             uij2B         = _mm_mul_ps(uijB, uijB);
760             uij3B         = _mm_mul_ps(uij2B, uijB);
761             lij2B         = _mm_mul_ps(lijB, lijB);
762             lij3B         = _mm_mul_ps(lij2B, lijB);
763
764             diff2         = _mm_sub_ps(uij2, lij2);
765             lij_inv       = gmx_mm_invsqrt_ps(lij2);
766             sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
767             prod          = _mm_mul_ps(onefourth, sk2_rinv);
768
769             diff2B        = _mm_sub_ps(uij2B, lij2B);
770             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
771             sk2_rinvB     = _mm_mul_ps(sk2_ai, rinvB);
772             prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
773
774             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
775             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
776
777             t1            = _mm_sub_ps(lij, uij);
778             t2            = _mm_mul_ps(diff2,
779                                        _mm_sub_ps(_mm_mul_ps(onefourth, r),
780                                                   prod));
781             t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
782             t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
783             t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
784             t4            = _mm_and_ps(t4, obc_mask3);
785             t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
786
787             t1B           = _mm_sub_ps(lijB, uijB);
788             t2B           = _mm_mul_ps(diff2B,
789                                        _mm_sub_ps(_mm_mul_ps(onefourth, rB),
790                                                   prodB));
791             t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
792             t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
793             t4B           = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
794             t4B           = _mm_and_ps(t4B, obc_mask3B);
795             t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
796
797             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
798             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
799
800             t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
801                                        _mm_mul_ps(prod, lij3));
802             t1            = _mm_sub_ps(t1,
803                                        _mm_mul_ps(onefourth,
804                                                   _mm_add_ps(_mm_mul_ps(lij, rinv),
805                                                              _mm_mul_ps(lij3, r))));
806             t2            = _mm_mul_ps(onefourth,
807                                        _mm_add_ps(_mm_mul_ps(uij, rinv),
808                                                   _mm_mul_ps(uij3, r)));
809             t2            = _mm_sub_ps(t2,
810                                        _mm_add_ps(_mm_mul_ps(half, uij2),
811                                                   _mm_mul_ps(prod, uij3)));
812             t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
813                                        _mm_mul_ps(rinv, rinv));
814             t3            = _mm_sub_ps(t3,
815                                        _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
816                                                   _mm_add_ps(one,
817                                                              _mm_mul_ps(sk2_rinv, rinv))));
818             t1            = _mm_mul_ps(rinv,
819                                        _mm_add_ps(_mm_mul_ps(dlij, t1),
820                                                   _mm_add_ps(t2, t3)));
821
822
823             t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
824                                        _mm_mul_ps(prodB, lij3B));
825             t1B           = _mm_sub_ps(t1B,
826                                        _mm_mul_ps(onefourth,
827                                                   _mm_add_ps(_mm_mul_ps(lijB, rinvB),
828                                                              _mm_mul_ps(lij3B, rB))));
829             t2B           = _mm_mul_ps(onefourth,
830                                        _mm_add_ps(_mm_mul_ps(uijB, rinvB),
831                                                   _mm_mul_ps(uij3B, rB)));
832             t2B           = _mm_sub_ps(t2B,
833                                        _mm_add_ps(_mm_mul_ps(half, uij2B),
834                                                   _mm_mul_ps(prodB, uij3B)));
835             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
836                                        _mm_mul_ps(rinvB, rinvB));
837             t3B           = _mm_sub_ps(t3B,
838                                        _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
839                                                   _mm_add_ps(one,
840                                                              _mm_mul_ps(sk2_rinvB, rinvB))));
841             t1B           = _mm_mul_ps(rinvB,
842                                        _mm_add_ps(_mm_mul_ps(dlijB, t1B),
843                                                   _mm_add_ps(t2B, t3B)));
844
845
846             dadx2         = _mm_and_ps(t1, obc_mask1);
847             dadx2B        = _mm_and_ps(t1B, obc_mask1B);
848
849             _mm_store_ps(dadx, dadx1);
850             dadx += 4;
851             _mm_store_ps(dadx, dadx2);
852             dadx += 4;
853             _mm_store_ps(dadx, dadx1B);
854             dadx += 4;
855             _mm_store_ps(dadx, dadx2B);
856             dadx += 4;
857
858         } /* end normal inner loop */
859
860         for (; k < nj1-offset; k += 4)
861         {
862             jnrA        = jjnr[k];
863             jnrB        = jjnr[k+1];
864             jnrC        = jjnr[k+2];
865             jnrD        = jjnr[k+3];
866
867             j3A         = 3*jnrA;
868             j3B         = 3*jnrB;
869             j3C         = 3*jnrC;
870             j3D         = 3*jnrD;
871
872             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
873             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
874             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
875
876             dx    = _mm_sub_ps(ix, jx);
877             dy    = _mm_sub_ps(iy, jy);
878             dz    = _mm_sub_ps(iz, jz);
879
880             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
881
882             rinv        = gmx_mm_invsqrt_ps(rsq);
883             r           = _mm_mul_ps(rsq, rinv);
884
885             /* Compute raj_inv aj1-4 */
886             raj_inv     = gmx_mm_inv_ps(raj);
887
888             /* Evaluate influence of atom aj -> ai */
889             t1            = _mm_add_ps(r, sk_aj);
890             obc_mask1     = _mm_cmplt_ps(rai, t1);
891
892             if (_mm_movemask_ps(obc_mask1))
893             {
894                 /* If any of the elements has rai<dr+sk, this is executed */
895                 t2            = _mm_sub_ps(r, sk_aj);
896                 t3            = _mm_sub_ps(sk_aj, r);
897
898                 obc_mask2     = _mm_cmplt_ps(rai, t2);
899                 obc_mask3     = _mm_cmplt_ps(rai, t3);
900
901                 uij           = gmx_mm_inv_ps(t1);
902                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
903                                              _mm_andnot_ps(obc_mask2, rai_inv));
904                 dlij          = _mm_and_ps(one, obc_mask2);
905                 uij2          = _mm_mul_ps(uij, uij);
906                 uij3          = _mm_mul_ps(uij2, uij);
907                 lij2          = _mm_mul_ps(lij, lij);
908                 lij3          = _mm_mul_ps(lij2, lij);
909                 diff2         = _mm_sub_ps(uij2, lij2);
910                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
911                 sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
912                 sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
913                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
914                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
915                 t1            = _mm_sub_ps(lij, uij);
916                 t2            = _mm_mul_ps(diff2,
917                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
918                                                       prod));
919                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
920                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
921                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
922                 t4            = _mm_and_ps(t4, obc_mask3);
923                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
924                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
925                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
926                                            _mm_mul_ps(prod, lij3));
927                 t1            = _mm_sub_ps(t1,
928                                            _mm_mul_ps(onefourth,
929                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
930                                                                  _mm_mul_ps(lij3, r))));
931                 t2            = _mm_mul_ps(onefourth,
932                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
933                                                       _mm_mul_ps(uij3, r)));
934                 t2            = _mm_sub_ps(t2,
935                                            _mm_add_ps(_mm_mul_ps(half, uij2),
936                                                       _mm_mul_ps(prod, uij3)));
937                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
938                                            _mm_mul_ps(rinv, rinv));
939                 t3            = _mm_sub_ps(t3,
940                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
941                                                       _mm_add_ps(one,
942                                                                  _mm_mul_ps(sk2_rinv, rinv))));
943                 t1            = _mm_mul_ps(rinv,
944                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
945                                                       _mm_add_ps(t2, t3)));
946
947                 dadx1         = _mm_and_ps(t1, obc_mask1);
948             }
949             else
950             {
951                 dadx1         = _mm_setzero_ps();
952             }
953
954             /* Evaluate influence of atom ai -> aj */
955             t1            = _mm_add_ps(r, sk_ai);
956             obc_mask1     = _mm_cmplt_ps(raj, t1);
957
958             if (_mm_movemask_ps(obc_mask1))
959             {
960                 t2            = _mm_sub_ps(r, sk_ai);
961                 t3            = _mm_sub_ps(sk_ai, r);
962                 obc_mask2     = _mm_cmplt_ps(raj, t2);
963                 obc_mask3     = _mm_cmplt_ps(raj, t3);
964
965                 uij           = gmx_mm_inv_ps(t1);
966                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
967                                              _mm_andnot_ps(obc_mask2, raj_inv));
968                 dlij          = _mm_and_ps(one, obc_mask2);
969                 uij2          = _mm_mul_ps(uij, uij);
970                 uij3          = _mm_mul_ps(uij2, uij);
971                 lij2          = _mm_mul_ps(lij, lij);
972                 lij3          = _mm_mul_ps(lij2, lij);
973                 diff2         = _mm_sub_ps(uij2, lij2);
974                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
975                 sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
976                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
977                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
978                 t1            = _mm_sub_ps(lij, uij);
979                 t2            = _mm_mul_ps(diff2,
980                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
981                                                       prod));
982                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
983                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
984                 t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
985                 t4            = _mm_and_ps(t4, obc_mask3);
986                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
987
988                 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
989
990                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
991                                            _mm_mul_ps(prod, lij3));
992                 t1            = _mm_sub_ps(t1,
993                                            _mm_mul_ps(onefourth,
994                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
995                                                                  _mm_mul_ps(lij3, r))));
996                 t2            = _mm_mul_ps(onefourth,
997                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
998                                                       _mm_mul_ps(uij3, r)));
999                 t2            = _mm_sub_ps(t2,
1000                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1001                                                       _mm_mul_ps(prod, uij3)));
1002                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1003                                            _mm_mul_ps(rinv, rinv));
1004                 t3            = _mm_sub_ps(t3,
1005                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1006                                                       _mm_add_ps(one,
1007                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1008                 t1            = _mm_mul_ps(rinv,
1009                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1010                                                       _mm_add_ps(t2, t3)));
1011                 dadx2         = _mm_and_ps(t1, obc_mask1);
1012             }
1013             else
1014             {
1015                 dadx2         = _mm_setzero_ps();
1016             }
1017
1018             _mm_store_ps(dadx, dadx1);
1019             dadx += 4;
1020             _mm_store_ps(dadx, dadx2);
1021             dadx += 4;
1022         } /* end normal inner loop */
1023
1024         if (offset != 0)
1025         {
1026             if (offset == 1)
1027             {
1028                 jnrA        = jjnr[k];
1029                 j3A         = 3*jnrA;
1030                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1031                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1032                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1033                 mask        = mask1;
1034             }
1035             else if (offset == 2)
1036             {
1037                 jnrA        = jjnr[k];
1038                 jnrB        = jjnr[k+1];
1039                 j3A         = 3*jnrA;
1040                 j3B         = 3*jnrB;
1041                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1042                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1043                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1044                 mask        = mask2;
1045             }
1046             else
1047             {
1048                 /* offset must be 3 */
1049                 jnrA        = jjnr[k];
1050                 jnrB        = jjnr[k+1];
1051                 jnrC        = jjnr[k+2];
1052                 j3A         = 3*jnrA;
1053                 j3B         = 3*jnrB;
1054                 j3C         = 3*jnrC;
1055                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1056                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1057                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1058                 mask        = mask3;
1059             }
1060
1061             dx    = _mm_sub_ps(ix, jx);
1062             dy    = _mm_sub_ps(iy, jy);
1063             dz    = _mm_sub_ps(iz, jz);
1064
1065             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
1066
1067             rinv        = gmx_mm_invsqrt_ps(rsq);
1068             r           = _mm_mul_ps(rsq, rinv);
1069
1070             /* Compute raj_inv aj1-4 */
1071             raj_inv     = gmx_mm_inv_ps(raj);
1072
1073             /* Evaluate influence of atom aj -> ai */
1074             t1            = _mm_add_ps(r, sk_aj);
1075             obc_mask1     = _mm_cmplt_ps(rai, t1);
1076             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1077
1078             if (_mm_movemask_ps(obc_mask1))
1079             {
1080                 t2            = _mm_sub_ps(r, sk_aj);
1081                 t3            = _mm_sub_ps(sk_aj, r);
1082                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1083                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1084
1085                 uij           = gmx_mm_inv_ps(t1);
1086                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1087                                              _mm_andnot_ps(obc_mask2, rai_inv));
1088                 dlij           = _mm_and_ps(one, obc_mask2);
1089                 uij2           = _mm_mul_ps(uij, uij);
1090                 uij3           = _mm_mul_ps(uij2, uij);
1091                 lij2           = _mm_mul_ps(lij, lij);
1092                 lij3           = _mm_mul_ps(lij2, lij);
1093                 diff2          = _mm_sub_ps(uij2, lij2);
1094                 lij_inv        = gmx_mm_invsqrt_ps(lij2);
1095                 sk2_aj         = _mm_mul_ps(sk_aj, sk_aj);
1096                 sk2_rinv       = _mm_mul_ps(sk2_aj, rinv);
1097                 prod           = _mm_mul_ps(onefourth, sk2_rinv);
1098                 logterm        = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1099                 t1             = _mm_sub_ps(lij, uij);
1100                 t2             = _mm_mul_ps(diff2,
1101                                             _mm_sub_ps(_mm_mul_ps(onefourth, r),
1102                                                        prod));
1103                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1104                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1105                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1106                 t4            = _mm_and_ps(t4, obc_mask3);
1107                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1108                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1109                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1110                                            _mm_mul_ps(prod, lij3));
1111                 t1            = _mm_sub_ps(t1,
1112                                            _mm_mul_ps(onefourth,
1113                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1114                                                                  _mm_mul_ps(lij3, r))));
1115                 t2            = _mm_mul_ps(onefourth,
1116                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1117                                                       _mm_mul_ps(uij3, r)));
1118                 t2            = _mm_sub_ps(t2,
1119                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1120                                                       _mm_mul_ps(prod, uij3)));
1121                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1122                                            _mm_mul_ps(rinv, rinv));
1123                 t3            = _mm_sub_ps(t3,
1124                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1125                                                       _mm_add_ps(one,
1126                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1127                 t1            = _mm_mul_ps(rinv,
1128                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1129                                                       _mm_add_ps(t2, t3)));
1130                 dadx1         = _mm_and_ps(t1, obc_mask1);
1131             }
1132             else
1133             {
1134                 dadx1         = _mm_setzero_ps();
1135             }
1136
1137             /* Evaluate influence of atom ai -> aj */
1138             t1            = _mm_add_ps(r, sk_ai);
1139             obc_mask1     = _mm_cmplt_ps(raj, t1);
1140             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1141
1142             if (_mm_movemask_ps(obc_mask1))
1143             {
1144                 t2            = _mm_sub_ps(r, sk_ai);
1145                 t3            = _mm_sub_ps(sk_ai, r);
1146                 obc_mask2     = _mm_cmplt_ps(raj, t2);
1147                 obc_mask3     = _mm_cmplt_ps(raj, t3);
1148
1149                 uij           = gmx_mm_inv_ps(t1);
1150                 lij           = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1151                                           _mm_andnot_ps(obc_mask2, raj_inv));
1152                 dlij          = _mm_and_ps(one, obc_mask2);
1153                 uij2          = _mm_mul_ps(uij, uij);
1154                 uij3          = _mm_mul_ps(uij2, uij);
1155                 lij2          = _mm_mul_ps(lij, lij);
1156                 lij3          = _mm_mul_ps(lij2, lij);
1157                 diff2         = _mm_sub_ps(uij2, lij2);
1158                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1159                 sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
1160                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
1161                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1162                 t1            = _mm_sub_ps(lij, uij);
1163                 t2            = _mm_mul_ps(diff2,
1164                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
1165                                                       prod));
1166                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1167                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1168                 t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
1169                 t4            = _mm_and_ps(t4, obc_mask3);
1170                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1171
1172                 tmp           = _mm_and_ps(t1, obc_mask1);
1173
1174                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1175                                            _mm_mul_ps(prod, lij3));
1176                 t1            = _mm_sub_ps(t1,
1177                                            _mm_mul_ps(onefourth,
1178                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1179                                                                  _mm_mul_ps(lij3, r))));
1180                 t2            = _mm_mul_ps(onefourth,
1181                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1182                                                       _mm_mul_ps(uij3, r)));
1183                 t2            = _mm_sub_ps(t2,
1184                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1185                                                       _mm_mul_ps(prod, uij3)));
1186                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1187                                            _mm_mul_ps(rinv, rinv));
1188                 t3            = _mm_sub_ps(t3,
1189                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1190                                                       _mm_add_ps(one,
1191                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1192                 t1            = _mm_mul_ps(rinv,
1193                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1194                                                       _mm_add_ps(t2, t3)));
1195                 dadx2         = _mm_and_ps(t1, obc_mask1);
1196             }
1197             else
1198             {
1199                 dadx2         = _mm_setzero_ps();
1200                 tmp           = _mm_setzero_ps();
1201             }
1202
1203             _mm_store_ps(dadx, dadx1);
1204             dadx += 4;
1205             _mm_store_ps(dadx, dadx2);
1206             dadx += 4;
1207
1208             if (offset == 1)
1209             {
1210                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
1211             }
1212             else if (offset == 2)
1213             {
1214                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
1215             }
1216             else
1217             {
1218                 /* offset must be 3 */
1219                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
1220             }
1221
1222         }
1223         GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
1224
1225     }
1226
1227     /* Parallel summations */
1228     if (DOMAINDECOMP(cr))
1229     {
1230         dd_atom_sum_real(cr->dd, work);
1231     }
1232
1233     if (gb_algorithm == egbHCT)
1234     {
1235         /* HCT */
1236         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1237         {
1238             if (born->use[i] != 0)
1239             {
1240                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1241                 sum     = 1.0/rr - work[i];
1242                 min_rad = rr + doffset;
1243                 rad     = 1.0/sum;
1244
1245                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
1246                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1247             }
1248         }
1249
1250         /* Extra communication required for DD */
1251         if (DOMAINDECOMP(cr))
1252         {
1253             dd_atom_spread_real(cr->dd, born->bRad);
1254             dd_atom_spread_real(cr->dd, fr->invsqrta);
1255         }
1256     }
1257     else
1258     {
1259         /* OBC */
1260         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1261         {
1262             if (born->use[i] != 0)
1263             {
1264                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
1265                 rr_inv2 = 1.0/rr;
1266                 rr      = rr-doffset;
1267                 rr_inv  = 1.0/rr;
1268                 sum     = rr * work[i];
1269                 sum2    = sum  * sum;
1270                 sum3    = sum2 * sum;
1271
1272                 tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1273                 born->bRad[i] = rr_inv - tsum*rr_inv2;
1274                 born->bRad[i] = 1.0 / born->bRad[i];
1275
1276                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1277
1278                 tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1279                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1280             }
1281         }
1282         /* Extra (local) communication required for DD */
1283         if (DOMAINDECOMP(cr))
1284         {
1285             dd_atom_spread_real(cr->dd, born->bRad);
1286             dd_atom_spread_real(cr->dd, fr->invsqrta);
1287             dd_atom_spread_real(cr->dd, born->drobc);
1288         }
1289     }
1290
1291
1292
1293     return 0;
1294 }
1295
1296
1297
1298 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1299                                     float *x, float *f, float *fshift, float *shiftvec,
1300                                     int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1301 {
1302     int          i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
1303     int          jnrA, jnrB, jnrC, jnrD;
1304     int          j3A, j3B, j3C, j3D;
1305     int          jnrE, jnrF, jnrG, jnrH;
1306     int          j3E, j3F, j3G, j3H;
1307     int       *  jjnr;
1308
1309     float        rbi, shX, shY, shZ;
1310     float       *rb;
1311
1312     __m128       ix, iy, iz;
1313     __m128       jx, jy, jz;
1314     __m128       jxB, jyB, jzB;
1315     __m128       fix, fiy, fiz;
1316     __m128       dx, dy, dz;
1317     __m128       tx, ty, tz;
1318     __m128       dxB, dyB, dzB;
1319     __m128       txB, tyB, tzB;
1320
1321     __m128       rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
1322     __m128       xmm1, xmm2, xmm3;
1323
1324     const __m128 two = _mm_set1_ps(2.0f);
1325
1326     rb     = born->work;
1327
1328     jjnr   = nl->jjnr;
1329
1330     /* Loop to get the proper form for the Born radius term, sse style */
1331     offset = natoms%4;
1332
1333     n0 = 0;
1334     n1 = natoms;
1335
1336     if (gb_algorithm == egbSTILL)
1337     {
1338         for (i = n0; i < n1; i++)
1339         {
1340             rbi   = born->bRad[i];
1341             rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1342         }
1343     }
1344     else if (gb_algorithm == egbHCT)
1345     {
1346         for (i = n0; i < n1; i++)
1347         {
1348             rbi   = born->bRad[i];
1349             rb[i] = rbi * rbi * dvda[i];
1350         }
1351     }
1352     else if (gb_algorithm == egbOBC)
1353     {
1354         for (i = n0; i < n1; i++)
1355         {
1356             rbi   = born->bRad[i];
1357             rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1358         }
1359     }
1360
1361     jz = _mm_setzero_ps();
1362
1363     n = j3A = j3B = j3C = j3D = 0;
1364
1365     for (i = 0; i < nl->nri; i++)
1366     {
1367         ii     = nl->iinr[i];
1368         ii3    = ii*3;
1369         is3    = 3*nl->shift[i];
1370         shX    = shiftvec[is3];
1371         shY    = shiftvec[is3+1];
1372         shZ    = shiftvec[is3+2];
1373         nj0    = nl->jindex[i];
1374         nj1    = nl->jindex[i+1];
1375
1376         ix     = _mm_set1_ps(shX+x[ii3+0]);
1377         iy     = _mm_set1_ps(shY+x[ii3+1]);
1378         iz     = _mm_set1_ps(shZ+x[ii3+2]);
1379
1380         offset = (nj1-nj0)%4;
1381
1382         rbai   = _mm_load1_ps(rb+ii);
1383         fix    = _mm_setzero_ps();
1384         fiy    = _mm_setzero_ps();
1385         fiz    = _mm_setzero_ps();
1386
1387
1388         for (k = nj0; k < nj1-offset; k += 4)
1389         {
1390             jnrA        = jjnr[k];
1391             jnrB        = jjnr[k+1];
1392             jnrC        = jjnr[k+2];
1393             jnrD        = jjnr[k+3];
1394
1395             j3A         = 3*jnrA;
1396             j3B         = 3*jnrB;
1397             j3C         = 3*jnrC;
1398             j3D         = 3*jnrD;
1399
1400             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
1401
1402             dx          = _mm_sub_ps(ix, jx);
1403             dy          = _mm_sub_ps(iy, jy);
1404             dz          = _mm_sub_ps(iz, jz);
1405
1406             GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
1407
1408             /* load chain rule terms for j1-4 */
1409             f_gb        = _mm_load_ps(dadx);
1410             dadx       += 4;
1411             f_gb_ai     = _mm_load_ps(dadx);
1412             dadx       += 4;
1413
1414             /* calculate scalar force */
1415             f_gb    = _mm_mul_ps(f_gb, rbai);
1416             f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1417             f_gb    = _mm_add_ps(f_gb, f_gb_ai);
1418
1419             tx     = _mm_mul_ps(f_gb, dx);
1420             ty     = _mm_mul_ps(f_gb, dy);
1421             tz     = _mm_mul_ps(f_gb, dz);
1422
1423             fix    = _mm_add_ps(fix, tx);
1424             fiy    = _mm_add_ps(fiy, ty);
1425             fiz    = _mm_add_ps(fiz, tz);
1426
1427             GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
1428         }
1429
1430         /*deal with odd elements */
1431         if (offset != 0)
1432         {
1433             if (offset == 1)
1434             {
1435                 jnrA        = jjnr[k];
1436                 j3A         = 3*jnrA;
1437                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1438                 GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
1439             }
1440             else if (offset == 2)
1441             {
1442                 jnrA        = jjnr[k];
1443                 jnrB        = jjnr[k+1];
1444                 j3A         = 3*jnrA;
1445                 j3B         = 3*jnrB;
1446                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1447                 GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
1448             }
1449             else
1450             {
1451                 /* offset must be 3 */
1452                 jnrA        = jjnr[k];
1453                 jnrB        = jjnr[k+1];
1454                 jnrC        = jjnr[k+2];
1455                 j3A         = 3*jnrA;
1456                 j3B         = 3*jnrB;
1457                 j3C         = 3*jnrC;
1458                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1459                 GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
1460             }
1461
1462             dx          = _mm_sub_ps(ix, jx);
1463             dy          = _mm_sub_ps(iy, jy);
1464             dz          = _mm_sub_ps(iz, jz);
1465
1466             /* load chain rule terms for j1-4 */
1467             f_gb        = _mm_load_ps(dadx);
1468             dadx       += 4;
1469             f_gb_ai     = _mm_load_ps(dadx);
1470             dadx       += 4;
1471
1472             /* calculate scalar force */
1473             f_gb    = _mm_mul_ps(f_gb, rbai);
1474             f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1475             f_gb    = _mm_add_ps(f_gb, f_gb_ai);
1476
1477             tx     = _mm_mul_ps(f_gb, dx);
1478             ty     = _mm_mul_ps(f_gb, dy);
1479             tz     = _mm_mul_ps(f_gb, dz);
1480
1481             fix    = _mm_add_ps(fix, tx);
1482             fiy    = _mm_add_ps(fiy, ty);
1483             fiz    = _mm_add_ps(fiz, tz);
1484
1485             if (offset == 1)
1486             {
1487                 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
1488             }
1489             else if (offset == 2)
1490             {
1491                 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
1492             }
1493             else
1494             {
1495                 /* offset must be 3 */
1496                 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
1497             }
1498         }
1499
1500         /* fix/fiy/fiz now contain four partial force terms, that all should be
1501          * added to the i particle forces and shift forces.
1502          */
1503         gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
1504     }
1505
1506     return 0;
1507 }
1508
1509
1510 #else
1511 /* keep compiler happy */
1512 int genborn_sse_dummy;
1513
1514 #endif /* SSE intrinsics available */