Merge branch 'release-4-6' into master
[alexxy/gromacs.git] / src / gromacs / mdlib / genborn_sse2_single.c
/*
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>

#include "typedefs.h"
#include "smalloc.h"
#include "genborn.h"
#include "vec.h"
#include "grompp.h"
#include "pdbio.h"
#include "names.h"
#include "physics.h"
#include "partdec.h"
#include "domdec.h"
#include "network.h"
#include "gmx_fatal.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"

/* Only compile this file if SSE intrinsics are available; note that the
 * "0 &&" in the guard below currently disables these kernels entirely.
 */
#if 0 && defined (GMX_X86_SSE2)

#include <gmx_sse2_single.h>
#include <emmintrin.h>

#include "genborn_sse2_single.h"

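/*
 * For reference, a minimal scalar sketch (illustrative only; this helper
 * does not exist elsewhere in GROMACS) of the per-pair Still descreening
 * term that the SIMD loops below evaluate four or eight pairs at a time.
 * It omits the dadx derivative terms that the real kernel also stores.
 */
static float
still_pair_contribution(float rsq, float rai, float raj, float vsolv_j)
{
    float rvdw  = rai + raj;
    float ratio = rsq/(rvdw*rvdw);
    float ccf   = 1.0f;
    float theta, term;

    if (ratio <= STILL_P5INV)
    {
        /* Cosine switching function for closely overlapping pairs */
        theta = ratio*STILL_PIP5;
        term  = 0.5f*(1.0f - cosf(theta));
        ccf   = term*term;
    }
    /* Contribution of atom j to the polarization sum of atom i */
    return STILL_P4*vsolv_j*ccf/(rsq*rsq);
}
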
int
calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              const t_atomtypes *atype, float *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int          i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int          jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
    int          shift;
    int         *mdtype;
    real         shX, shY, shZ;
    int         *jjnr;
    real        *shiftvec;

    float        gpi_ai, gpi2;
    float        factor;
    float       *gb_radius;
    float       *vsolv;
    float       *work;
    float       *dadx;

    __m128       ix, iy, iz;
    __m128       jx, jy, jz;
    __m128       dx, dy, dz;
    __m128       tx, ty, tz;
    __m128       jxB, jyB, jzB;
    __m128       dxB, dyB, dzB;
    __m128       txB, tyB, tzB;
    __m128       rsq, rinv, rinv2, rinv4, rinv6;
    __m128       rsqB, rinvB, rinv2B, rinv4B, rinv6B;
    __m128       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128       ratioB, rajB, vajB, rvdwB;
    __m128       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128       ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
    __m128       mask, icf4, icf6, mask_cmp;
    __m128       icf4B, icf6B, mask_cmpB;

    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
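    /* mask1-mask3 keep the low 1, 2 or 3 lanes of an __m128 and zero the
     * rest; they discard the unused lanes when the neighbor-list length
     * is not a multiple of four.
     */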

    const __m128 half   = _mm_set1_ps(0.5f);
    const __m128 three  = _mm_set1_ps(3.0f);
    const __m128 one    = _mm_set1_ps(1.0f);
    const __m128 two    = _mm_set1_ps(2.0f);
    const __m128 zero   = _mm_set1_ps(0.0f);
    const __m128 four   = _mm_set1_ps(4.0f);

    const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
    const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
    const __m128 still_p4     = _mm_set1_ps(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = jnrC = jnrD = 0;
    jx   = _mm_setzero_ps();
    jy   = _mm_setzero_ps();
    jz   = _mm_setzero_ps();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

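        /* The j loop is unrolled three ways: an eight-wide main loop,
         * a four-wide loop for a remaining full quad, and a masked
         * epilogue for the last offset (1-3) entries.
         */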
        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_ps();

        rai     = _mm_load1_ps(gb_radius+ii);
        prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);

        for (k = nj0; k < nj1-4-offset; k += 8)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];
            jnrE        = jjnr[k+4];
            jnrF        = jjnr[k+5];
            jnrG        = jjnr[k+6];
            jnrH        = jjnr[k+7];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;
            j3E         = 3*jnrE;
            j3F         = 3*jnrF;
            j3G         = 3*jnrG;
            j3H         = 3*jnrH;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);
            dxB         = _mm_sub_ps(ix, jxB);
            dyB         = _mm_sub_ps(iy, jyB);
            dzB         = _mm_sub_ps(iz, jzB);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinvB       = gmx_mm_invsqrt_ps(rsqB);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv2B      = _mm_mul_ps(rinvB, rinvB);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv4B      = _mm_mul_ps(rinv2B, rinv2B);
            rinv6       = _mm_mul_ps(rinv4, rinv2);
            rinv6B      = _mm_mul_ps(rinv4B, rinv2B);

            rvdw        = _mm_add_ps(rai, raj);
            rvdwB       = _mm_add_ps(rai, rajB);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
            ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
            mask_cmpB   = _mm_cmple_ps(ratioB, still_p5inv);

            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }
            if (0 == _mm_movemask_ps(mask_cmpB))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccfB        = one;
                dccfB       = _mm_setzero_ps();
            }
            else
            {
                ratioB      = _mm_min_ps(ratioB, still_p5inv);
                thetaB      = _mm_mul_ps(ratioB, still_pip5);
                gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
                termB       = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
                ccfB        = _mm_mul_ps(termB, termB);
                dccfB       = _mm_mul_ps(_mm_mul_ps(two, termB),
                                         _mm_mul_ps(sinqB, thetaB));
            }

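            /* icf4 = ccf/r^4 feeds the polarization sum itself;
             * icf6 = (4*ccf - dccf)/r^6 is the chain-rule factor saved
             * in dadx for the force calculation later.
             */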
            prod        = _mm_mul_ps(still_p4, vaj);
            prodB       = _mm_mul_ps(still_p4, vajB);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf4B       = _mm_mul_ps(ccfB, rinv4B);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
            icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));

            gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );

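            /* Store the derivative terms for these two quads: the j-side
             * factor (prod*icf6) followed by the i-side factor
             * (prod_ai*icf6), for quads A and B in turn.
             */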
            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
            dadx += 4;
        }

        for (; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv6       = _mm_mul_ps(rinv4, rinv2);

            rvdw        = _mm_add_ps(rai, raj);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);

            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));

            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
        }

        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA        = jjnr[k];
                j3A         = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
                GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
                mask        = mask1;
            }
            else if (offset == 2)
            {
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
                GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
                mask        = mask2;
            }
            else
            {
                /* offset must be 3 */
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                jnrC        = jjnr[k+2];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                j3C         = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
                GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
                mask        = mask3;
            }

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv6       = _mm_mul_ps(rinv4, rinv2);

            rvdw        = _mm_add_ps(rai, raj);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);

            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;

            tmp = _mm_mul_ps(prod_ai, icf4);

            if (offset == 1)
            {
                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
            }
            else if (offset == 2)
            {
                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
            }
        }
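        /* Reduce the four lanes of gpi and accumulate the result into work[ii] */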
        GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

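    /* Map the accumulated polarization term to an effective Born radius,
     * bRad = 0.5*ONE_4PI_EPS0/|gpi_ai|; squaring followed by invsqrt
     * yields the absolute value without a branch on the sign.
     */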
    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}

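/*
 * A scalar sketch of the HCT/OBC pairwise descreening integral evaluated
 * by the SIMD blocks in calc_gb_rad_hct_obc_sse2_single() (notation here
 * is for clarity only). With U = r + sk_j, and 1/L = 1/(r - sk_j) when
 * rai < r - sk_j but 1/rai otherwise, each pair contributes
 *
 *   I = 0.5*( 1/L - 1/U + (r/4)*(1/U^2 - 1/L^2) + ln(L/U)/(2*r)
 *             + (sk_j^2/(4*r))*(1/L^2 - 1/U^2) )
 *
 * plus 0.5*2*(1/rai - 1/L) when atom i lies entirely inside the
 * descreening sphere of j (rai < sk_j - r). The t1..t4 temporaries below
 * build these terms, and dadx receives the related d/dr chain-rule
 * factors for the force pass.
 */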
int
calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int          i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int          jnrA, jnrB, jnrC, jnrD;
    int          j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH;
    int          j3E, j3F, j3G, j3H;
    float        shX, shY, shZ;
    float        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    float        sum_ai2, sum_ai3, tsum, tchain, doffset;
    float       *obc_param;
    float       *gb_radius;
    float       *work;
    int         *jjnr;
    float       *dadx;
    float       *shiftvec;
    float        min_rad, rad;

    __m128       ix, iy, iz, jx, jy, jz;
    __m128       dx, dy, dz, t1, t2, t3, t4;
    __m128       rsq, rinv, r;
    __m128       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128       uij, lij2, uij2, lij3, uij3, diff2;
    __m128       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128       dadx1, dadx2;
    __m128       logterm;
    __m128       mask;
    __m128       obc_mask1, obc_mask2, obc_mask3;
    __m128       jxB, jyB, jzB, t1B, t2B, t3B, t4B;
    __m128       dxB, dyB, dzB, rsqB, rinvB, rB;
    __m128       rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
    __m128       uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
    __m128       lij_invB, sk2_invB, prodB;
    __m128       sk_ajB, sk2_ajB, sk2_rinvB;
    __m128       dadx1B, dadx2B;
    __m128       logtermB;
    __m128       obc_mask1B, obc_mask2B, obc_mask3B;

    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );

    __m128       oneeighth   = _mm_set1_ps(0.125);
    __m128       onefourth   = _mm_set1_ps(0.25);

    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);
    const __m128 one   = _mm_set1_ps(1.0f);
    const __m128 two   = _mm_set1_ps(2.0f);
    const __m128 zero  = _mm_set1_ps(0.0f);
    const __m128 neg   = _mm_set1_ps(-1.0f);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_ps();
    jy        = _mm_setzero_ps();
    jz        = _mm_setzero_ps();

    jnrA = jnrB = jnrC = jnrD = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        rai     = _mm_load1_ps(gb_radius+ii);
        rai_inv = gmx_mm_inv_ps(rai);

        sum_ai = _mm_setzero_ps();

        sk_ai  = _mm_load1_ps(born->param+ii);
        sk2_ai = _mm_mul_ps(sk_ai, sk_ai);

        for (k = nj0; k < nj1-4-offset; k += 8)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];
            jnrE        = jjnr[k+4];
            jnrF        = jjnr[k+5];
            jnrG        = jjnr[k+6];
            jnrH        = jjnr[k+7];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;
            j3E         = 3*jnrE;
            j3F         = 3*jnrF;
            j3G         = 3*jnrG;
            j3H         = 3*jnrH;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);

            dx    = _mm_sub_ps(ix, jx);
            dy    = _mm_sub_ps(iy, jy);
            dz    = _mm_sub_ps(iz, jz);
            dxB   = _mm_sub_ps(ix, jxB);
            dyB   = _mm_sub_ps(iy, jyB);
            dzB   = _mm_sub_ps(iz, jzB);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);

            rinv        = gmx_mm_invsqrt_ps(rsq);
            r           = _mm_mul_ps(rsq, rinv);
            rinvB       = gmx_mm_invsqrt_ps(rsqB);
            rB          = _mm_mul_ps(rsqB, rinvB);

            /* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_ps(raj);
            raj_invB    = gmx_mm_inv_ps(rajB);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_ps(r, sk_aj);
            t2            = _mm_sub_ps(r, sk_aj);
            t3            = _mm_sub_ps(sk_aj, r);
            t1B           = _mm_add_ps(rB, sk_ajB);
            t2B           = _mm_sub_ps(rB, sk_ajB);
            t3B           = _mm_sub_ps(sk_ajB, rB);
            obc_mask1     = _mm_cmplt_ps(rai, t1);
            obc_mask2     = _mm_cmplt_ps(rai, t2);
            obc_mask3     = _mm_cmplt_ps(rai, t3);
            obc_mask1B    = _mm_cmplt_ps(rai, t1B);
            obc_mask2B    = _mm_cmplt_ps(rai, t2B);
            obc_mask3B    = _mm_cmplt_ps(rai, t3B);

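            /* obc_mask1: rai < r + sk_aj, j descreens i at all;
             * obc_mask2: rai < r - sk_aj, i is fully outside the j sphere,
             * so the lower limit is 1/(r - sk_aj), otherwise it is 1/rai;
             * obc_mask3: rai < sk_aj - r, i is engulfed by the j sphere and
             * receives the extra 2*(1/rai - lij) correction term.
             */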
            uij           = gmx_mm_inv_ps(t1);
            lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                         _mm_andnot_ps(obc_mask2, rai_inv));
            dlij          = _mm_and_ps(one, obc_mask2);
            uij2          = _mm_mul_ps(uij, uij);
            uij3          = _mm_mul_ps(uij2, uij);
            lij2          = _mm_mul_ps(lij, lij);
            lij3          = _mm_mul_ps(lij2, lij);

            uijB          = gmx_mm_inv_ps(t1B);
            lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
                                         _mm_andnot_ps(obc_mask2B, rai_inv));
            dlijB         = _mm_and_ps(one, obc_mask2B);
            uij2B         = _mm_mul_ps(uijB, uijB);
            uij3B         = _mm_mul_ps(uij2B, uijB);
            lij2B         = _mm_mul_ps(lijB, lijB);
            lij3B         = _mm_mul_ps(lij2B, lijB);

            diff2         = _mm_sub_ps(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_ps(lij2);
            sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
            prod          = _mm_mul_ps(onefourth, sk2_rinv);

            diff2B        = _mm_sub_ps(uij2B, lij2B);
            lij_invB      = gmx_mm_invsqrt_ps(lij2B);
            sk2_ajB       = _mm_mul_ps(sk_ajB, sk_ajB);
            sk2_rinvB     = _mm_mul_ps(sk2_ajB, rinvB);
            prodB         = _mm_mul_ps(onefourth, sk2_rinvB);

            logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
            logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));

            t1            = _mm_sub_ps(lij, uij);
            t2            = _mm_mul_ps(diff2,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                  prod));
            t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
            t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
            t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
            t4            = _mm_and_ps(t4, obc_mask3);
            t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

            t1B           = _mm_sub_ps(lijB, uijB);
            t2B           = _mm_mul_ps(diff2B,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, rB),
                                                  prodB));
            t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
            t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
            t4B           = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
            t4B           = _mm_and_ps(t4B, obc_mask3B);
            t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));

            sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));

            t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                       _mm_mul_ps(prod, lij3));
            t1            = _mm_sub_ps(t1,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                             _mm_mul_ps(lij3, r))));
            t2            = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                  _mm_mul_ps(uij3, r)));
            t2            = _mm_sub_ps(t2,
                                       _mm_add_ps(_mm_mul_ps(half, uij2),
                                                  _mm_mul_ps(prod, uij3)));
            t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                       _mm_mul_ps(rinv, rinv));
            t3            = _mm_sub_ps(t3,
                                       _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinv, rinv))));
            t1            = _mm_mul_ps(rinv,
                                       _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                  _mm_add_ps(t2, t3)));

            t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
                                       _mm_mul_ps(prodB, lij3B));
            t1B           = _mm_sub_ps(t1B,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lijB, rinvB),
                                                             _mm_mul_ps(lij3B, rB))));
            t2B           = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uijB, rinvB),
                                                  _mm_mul_ps(uij3B, rB)));
            t2B           = _mm_sub_ps(t2B,
                                       _mm_add_ps(_mm_mul_ps(half, uij2B),
                                                  _mm_mul_ps(prodB, uij3B)));
            t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
                                       _mm_mul_ps(rinvB, rinvB));
            t3B           = _mm_sub_ps(t3B,
                                       _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinvB, rinvB))));
            t1B           = _mm_mul_ps(rinvB,
                                       _mm_add_ps(_mm_mul_ps(dlijB, t1B),
                                                  _mm_add_ps(t2B, t3B)));

            dadx1         = _mm_and_ps(t1, obc_mask1);
            dadx1B        = _mm_and_ps(t1B, obc_mask1B);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_ps(r, sk_ai);
            t2            = _mm_sub_ps(r, sk_ai);
            t3            = _mm_sub_ps(sk_ai, r);
            t1B           = _mm_add_ps(rB, sk_ai);
            t2B           = _mm_sub_ps(rB, sk_ai);
            t3B           = _mm_sub_ps(sk_ai, rB);
            obc_mask1     = _mm_cmplt_ps(raj, t1);
            obc_mask2     = _mm_cmplt_ps(raj, t2);
            obc_mask3     = _mm_cmplt_ps(raj, t3);
            obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
            obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
            obc_mask3B    = _mm_cmplt_ps(rajB, t3B);

            uij           = gmx_mm_inv_ps(t1);
            lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                         _mm_andnot_ps(obc_mask2, raj_inv));
            dlij          = _mm_and_ps(one, obc_mask2);
            uij2          = _mm_mul_ps(uij, uij);
            uij3          = _mm_mul_ps(uij2, uij);
            lij2          = _mm_mul_ps(lij, lij);
            lij3          = _mm_mul_ps(lij2, lij);

            uijB          = gmx_mm_inv_ps(t1B);
            lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
                                         _mm_andnot_ps(obc_mask2B, raj_invB));
            dlijB         = _mm_and_ps(one, obc_mask2B);
            uij2B         = _mm_mul_ps(uijB, uijB);
            uij3B         = _mm_mul_ps(uij2B, uijB);
            lij2B         = _mm_mul_ps(lijB, lijB);
            lij3B         = _mm_mul_ps(lij2B, lijB);

            diff2         = _mm_sub_ps(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_ps(lij2);
            sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
            prod          = _mm_mul_ps(onefourth, sk2_rinv);

            diff2B        = _mm_sub_ps(uij2B, lij2B);
            lij_invB      = gmx_mm_invsqrt_ps(lij2B);
            sk2_rinvB     = _mm_mul_ps(sk2_ai, rinvB);
            prodB         = _mm_mul_ps(onefourth, sk2_rinvB);

            logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
            logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));

            t1            = _mm_sub_ps(lij, uij);
            t2            = _mm_mul_ps(diff2,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                  prod));
            t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
            t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
            t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
            t4            = _mm_and_ps(t4, obc_mask3);
            t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

            t1B           = _mm_sub_ps(lijB, uijB);
            t2B           = _mm_mul_ps(diff2B,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, rB),
                                                  prodB));
            t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
            t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
            t4B           = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
            t4B           = _mm_and_ps(t4B, obc_mask3B);
            t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));

            t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                       _mm_mul_ps(prod, lij3));
            t1            = _mm_sub_ps(t1,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                             _mm_mul_ps(lij3, r))));
            t2            = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                  _mm_mul_ps(uij3, r)));
            t2            = _mm_sub_ps(t2,
                                       _mm_add_ps(_mm_mul_ps(half, uij2),
                                                  _mm_mul_ps(prod, uij3)));
            t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                       _mm_mul_ps(rinv, rinv));
            t3            = _mm_sub_ps(t3,
                                       _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinv, rinv))));
            t1            = _mm_mul_ps(rinv,
                                       _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                  _mm_add_ps(t2, t3)));

            t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
                                       _mm_mul_ps(prodB, lij3B));
            t1B           = _mm_sub_ps(t1B,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lijB, rinvB),
                                                             _mm_mul_ps(lij3B, rB))));
            t2B           = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uijB, rinvB),
                                                  _mm_mul_ps(uij3B, rB)));
            t2B           = _mm_sub_ps(t2B,
                                       _mm_add_ps(_mm_mul_ps(half, uij2B),
                                                  _mm_mul_ps(prodB, uij3B)));
            t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
                                       _mm_mul_ps(rinvB, rinvB));
            t3B           = _mm_sub_ps(t3B,
                                       _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinvB, rinvB))));
            t1B           = _mm_mul_ps(rinvB,
                                       _mm_add_ps(_mm_mul_ps(dlijB, t1B),
                                                  _mm_add_ps(t2B, t3B)));

            dadx2         = _mm_and_ps(t1, obc_mask1);
            dadx2B        = _mm_and_ps(t1B, obc_mask1B);

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;
            _mm_store_ps(dadx, dadx1B);
            dadx += 4;
            _mm_store_ps(dadx, dadx2B);
            dadx += 4;

        } /* end normal inner loop */

        for (; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);

            dx    = _mm_sub_ps(ix, jx);
            dy    = _mm_sub_ps(iy, jy);
            dz    = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_ps(rsq);
            r           = _mm_mul_ps(rsq, rinv);

            /* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_ps(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_ps(r, sk_aj);
            obc_mask1     = _mm_cmplt_ps(rai, t1);

            if (_mm_movemask_ps(obc_mask1))
            {
                /* If any of the elements has rai<dr+sk, this is executed */
                t2            = _mm_sub_ps(r, sk_aj);
                t3            = _mm_sub_ps(sk_aj, r);

                obc_mask2     = _mm_cmplt_ps(rai, t2);
                obc_mask3     = _mm_cmplt_ps(rai, t3);

                uij           = gmx_mm_inv_ps(t1);
                lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                             _mm_andnot_ps(obc_mask2, rai_inv));
                dlij          = _mm_and_ps(one, obc_mask2);
                uij2          = _mm_mul_ps(uij, uij);
                uij3          = _mm_mul_ps(uij2, uij);
                lij2          = _mm_mul_ps(lij, lij);
                lij3          = _mm_mul_ps(lij2, lij);
                diff2         = _mm_sub_ps(uij2, lij2);
                lij_inv       = gmx_mm_invsqrt_ps(lij2);
                sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
                sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
                prod          = _mm_mul_ps(onefourth, sk2_rinv);
                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1            = _mm_sub_ps(lij, uij);
                t2            = _mm_mul_ps(diff2,
                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                      prod));
                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
                t4            = _mm_and_ps(t4, obc_mask3);
                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
                sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));

                dadx1         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx1         = _mm_setzero_ps();
            }

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_ps(r, sk_ai);
            obc_mask1     = _mm_cmplt_ps(raj, t1);

            if (_mm_movemask_ps(obc_mask1))
            {
                t2            = _mm_sub_ps(r, sk_ai);
                t3            = _mm_sub_ps(sk_ai, r);
                obc_mask2     = _mm_cmplt_ps(raj, t2);
                obc_mask3     = _mm_cmplt_ps(raj, t3);

                uij           = gmx_mm_inv_ps(t1);
                lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                             _mm_andnot_ps(obc_mask2, raj_inv));
                dlij          = _mm_and_ps(one, obc_mask2);
                uij2          = _mm_mul_ps(uij, uij);
                uij3          = _mm_mul_ps(uij2, uij);
                lij2          = _mm_mul_ps(lij, lij);
                lij3          = _mm_mul_ps(lij2, lij);
                diff2         = _mm_sub_ps(uij2, lij2);
                lij_inv       = gmx_mm_invsqrt_ps(lij2);
                sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
                prod          = _mm_mul_ps(onefourth, sk2_rinv);
                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1            = _mm_sub_ps(lij, uij);
                t2            = _mm_mul_ps(diff2,
                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                      prod));
                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
                t4            = _mm_and_ps(t4, obc_mask3);
                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

                GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));

                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));
                dadx2         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx2         = _mm_setzero_ps();
            }

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;
        } /* end normal inner loop */

1028         if (offset != 0)
1029         {
1030             if (offset == 1)
1031             {
1032                 jnrA        = jjnr[k];
1033                 j3A         = 3*jnrA;
1034                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1035                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1036                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1037                 mask        = mask1;
1038             }
1039             else if (offset == 2)
1040             {
1041                 jnrA        = jjnr[k];
1042                 jnrB        = jjnr[k+1];
1043                 j3A         = 3*jnrA;
1044                 j3B         = 3*jnrB;
1045                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1046                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1047                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1048                 mask        = mask2;
1049             }
1050             else
1051             {
1052                 /* offset must be 3 */
1053                 jnrA        = jjnr[k];
1054                 jnrB        = jjnr[k+1];
1055                 jnrC        = jjnr[k+2];
1056                 j3A         = 3*jnrA;
1057                 j3B         = 3*jnrB;
1058                 j3C         = 3*jnrC;
1059                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1060                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1061                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1062                 mask        = mask3;
1063             }
1064
1065             dx    = _mm_sub_ps(ix, jx);
1066             dy    = _mm_sub_ps(iy, jy);
1067             dz    = _mm_sub_ps(iz, jz);
1068
1069             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
1070
1071             rinv        = gmx_mm_invsqrt_ps(rsq);
1072             r           = _mm_mul_ps(rsq, rinv);
1073
1074             /* Compute raj_inv aj1-4 */
1075             raj_inv     = gmx_mm_inv_ps(raj);
1076
1077             /* Evaluate influence of atom aj -> ai */
1078             t1            = _mm_add_ps(r, sk_aj);
1079             obc_mask1     = _mm_cmplt_ps(rai, t1);
1080             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1081
1082             if (_mm_movemask_ps(obc_mask1))
1083             {
1084                 t2            = _mm_sub_ps(r, sk_aj);
1085                 t3            = _mm_sub_ps(sk_aj, r);
1086                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1087                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1088
1089                 uij           = gmx_mm_inv_ps(t1);
1090                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1091                                              _mm_andnot_ps(obc_mask2, rai_inv));
1092                 dlij           = _mm_and_ps(one, obc_mask2);
1093                 uij2           = _mm_mul_ps(uij, uij);
1094                 uij3           = _mm_mul_ps(uij2, uij);
1095                 lij2           = _mm_mul_ps(lij, lij);
1096                 lij3           = _mm_mul_ps(lij2, lij);
1097                 diff2          = _mm_sub_ps(uij2, lij2);
1098                 lij_inv        = gmx_mm_invsqrt_ps(lij2);
1099                 sk2_aj         = _mm_mul_ps(sk_aj, sk_aj);
1100                 sk2_rinv       = _mm_mul_ps(sk2_aj, rinv);
1101                 prod           = _mm_mul_ps(onefourth, sk2_rinv);
1102                 logterm        = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1103                 t1             = _mm_sub_ps(lij, uij);
1104                 t2             = _mm_mul_ps(diff2,
1105                                             _mm_sub_ps(_mm_mul_ps(onefourth, r),
1106                                                        prod));
1107                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1108                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1109                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1110                 t4            = _mm_and_ps(t4, obc_mask3);
1111                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1112                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1113                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1114                                            _mm_mul_ps(prod, lij3));
1115                 t1            = _mm_sub_ps(t1,
1116                                            _mm_mul_ps(onefourth,
1117                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1118                                                                  _mm_mul_ps(lij3, r))));
1119                 t2            = _mm_mul_ps(onefourth,
1120                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1121                                                       _mm_mul_ps(uij3, r)));
1122                 t2            = _mm_sub_ps(t2,
1123                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1124                                                       _mm_mul_ps(prod, uij3)));
1125                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1126                                            _mm_mul_ps(rinv, rinv));
1127                 t3            = _mm_sub_ps(t3,
1128                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1129                                                       _mm_add_ps(one,
1130                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1131                 t1            = _mm_mul_ps(rinv,
1132                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1133                                                       _mm_add_ps(t2, t3)));
1134                 dadx1         = _mm_and_ps(t1, obc_mask1);
1135             }
1136             else
1137             {
1138                 dadx1         = _mm_setzero_ps();
1139             }

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_ps(r, sk_ai);
            obc_mask1     = _mm_cmplt_ps(raj, t1);
            obc_mask1     = _mm_and_ps(obc_mask1, mask);

            if (_mm_movemask_ps(obc_mask1))
            {
                t2            = _mm_sub_ps(r, sk_ai);
                t3            = _mm_sub_ps(sk_ai, r);
                obc_mask2     = _mm_cmplt_ps(raj, t2);
                obc_mask3     = _mm_cmplt_ps(raj, t3);

                uij           = gmx_mm_inv_ps(t1);
                lij           = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                          _mm_andnot_ps(obc_mask2, raj_inv));
                dlij          = _mm_and_ps(one, obc_mask2);
                uij2          = _mm_mul_ps(uij, uij);
                uij3          = _mm_mul_ps(uij2, uij);
                lij2          = _mm_mul_ps(lij, lij);
                lij3          = _mm_mul_ps(lij2, lij);
                diff2         = _mm_sub_ps(uij2, lij2);
                lij_inv       = gmx_mm_invsqrt_ps(lij2);
                sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
                prod          = _mm_mul_ps(onefourth, sk2_rinv);
                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1            = _mm_sub_ps(lij, uij);
                t2            = _mm_mul_ps(diff2,
                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                      prod));
                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
                t4            = _mm_and_ps(t4, obc_mask3);
                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

                tmp           = _mm_and_ps(t1, obc_mask1);

                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));
                dadx2         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx2         = _mm_setzero_ps();
                tmp           = _mm_setzero_ps();
            }

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;
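            /* The two derivative sets are stored back to back, four floats
             * each; calc_gb_chainrule_sse2_single() reads them in the same
             * order, so the store and load sequences must stay in sync.
             */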

            if (offset == 1)
            {
                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
            }
            else if (offset == 2)
            {
                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
            }

        }
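        /* Horizontally add the four partial sums in sum_ai into atom ii's
         * entry of work[].
         */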
        GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

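    /* After the (optional) reduction, work[i] holds the complete descreening
     * sum for atom i, independent of how atoms were distributed over nodes.
     */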
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
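        /* Invert the accumulated pairwise sums to get the HCT Born radii:
         *   1/R_i = 1/(rho_i - doffset) - work[i],
         * where rho_i is the atom's GB radius and doffset the dielectric
         * offset; R_i is clamped from below at rho_i.
         */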
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
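        /* OBC (Onufriev-Bashford-Case) Born radii: with
         *   psi = (rho_i - doffset)*work[i],
         *   1/R_i = 1/(rho_i - doffset)
         *           - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho_i;
         * drobc[i] additionally stores the derivative of the tanh term,
         * needed for the chain rule in the force calculation.
         */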
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}

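/* Chain-rule contribution of the Born radii to the forces. For each pair
 * the radius kernels stored two derivative sets in dadx: d(sum_i)/dr and
 * d(sum_j)/dr. With rb[] holding the per-atom prefactors (roughly
 * R^2*dvda, with algorithm-specific extra factors; see the conversion
 * loops below), the scalar pair force is
 *   f_gb = dadx_ij*rb[i] + dadx_ji*rb[j],
 * applied along the connection vector: added to atom i, subtracted from j.
 */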
float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
                                    float *x, float *f, float *fshift, float *shiftvec,
                                    int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int          i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
    int          jnrA, jnrB, jnrC, jnrD;
    int          j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH;
    int          j3E, j3F, j3G, j3H;
    int       *  jjnr;

    float        rbi, shX, shY, shZ;
    float       *rb;

    __m128       ix, iy, iz;
    __m128       jx, jy, jz;
    __m128       jxB, jyB, jzB;
    __m128       fix, fiy, fiz;
    __m128       dx, dy, dz;
    __m128       tx, ty, tz;
    __m128       dxB, dyB, dzB;
    __m128       txB, tyB, tzB;

    __m128       rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
    __m128       xmm1, xmm2, xmm3;

    const __m128 two = _mm_set1_ps(2.0f);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Convert dvda into the proper form for the Born radius chain-rule term, SSE style */
    offset = natoms%4;

    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }
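
    /* rb[i] now carries R_i^2*dvda[i] with the per-algorithm extra factor
     * (2/ONE_4PI_EPS0 for Still, none for HCT, drobc[i] for OBC), so the
     * inner loop below needs only one multiply per stored dadx element.
     */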

    jz = _mm_setzero_ps();

    n = j3A = j3B = j3C = j3D = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        rbai   = _mm_load1_ps(rb+ii);
        fix    = _mm_setzero_ps();
        fiy    = _mm_setzero_ps();
        fiz    = _mm_setzero_ps();

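        /* Main loop: process the j neighbors four at a time; any remaining
         * 1-3 neighbors are handled in the epilogue below.
         */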
        for (k = nj0; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);

            /* load chain rule terms for j1-4 */
            f_gb        = _mm_load_ps(dadx);
            dadx       += 4;
            f_gb_ai     = _mm_load_ps(dadx);
            dadx       += 4;

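            /* f_gb is the dadx1 set (derivative of the i atom's Born sum)
             * and f_gb_ai the dadx2 set (derivatives of the j atoms' sums),
             * stored back to back by the radius kernel.
             */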
            /* calculate scalar force */
            f_gb    = _mm_mul_ps(f_gb, rbai);
            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
            f_gb    = _mm_add_ps(f_gb, f_gb_ai);

            tx     = _mm_mul_ps(f_gb, dx);
            ty     = _mm_mul_ps(f_gb, dy);
            tz     = _mm_mul_ps(f_gb, dz);

            fix    = _mm_add_ps(fix, tx);
            fiy    = _mm_add_ps(fiy, ty);
            fiz    = _mm_add_ps(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
        }

        /* Deal with the remaining 1-3 odd j elements */
        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA        = jjnr[k];
                j3A         = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
            }
            else if (offset == 2)
            {
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
            }
            else
            {
                /* offset must be 3 */
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                jnrC        = jjnr[k+2];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                j3C         = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
            }

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            /* load chain rule terms for the remaining 1-3 j particles */
            f_gb        = _mm_load_ps(dadx);
            dadx       += 4;
            f_gb_ai     = _mm_load_ps(dadx);
            dadx       += 4;

            /* calculate scalar force */
            f_gb    = _mm_mul_ps(f_gb, rbai);
            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
            f_gb    = _mm_add_ps(f_gb, f_gb_ai);

            tx     = _mm_mul_ps(f_gb, dx);
            ty     = _mm_mul_ps(f_gb, dy);
            tz     = _mm_mul_ps(f_gb, dz);

            fix    = _mm_add_ps(fix, tx);
            fiy    = _mm_add_ps(fiy, ty);
            fiz    = _mm_add_ps(fiz, tz);

            if (offset == 1)
            {
                GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
            }
            else if (offset == 2)
            {
                GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
            }
        }

        /* fix/fiy/fiz now contain four partial force terms that should all
         * be added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}
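
/* For reference, a scalar sketch of the per-pair update performed by the
 * SIMD loops above. This is a hypothetical helper, not part of the build:
 * it assumes dadx_ij/dadx_ji are the two pair derivatives stored by the
 * radius kernel and rb_i/rb_j the converted prefactors from rb[].
 */
static void
gb_chainrule_pair_ref(const float xi[3], const float xj[3],
                      float dadx_ij, float dadx_ji,
                      float rb_i, float rb_j,
                      float fi[3], float fj[3])
{
    float dx  = xi[0] - xj[0];
    float dy  = xi[1] - xj[1];
    float dz  = xi[2] - xj[2];
    /* combine both directions of the pair derivative into one scalar */
    float fgb = dadx_ij*rb_i + dadx_ji*rb_j;

    /* the force acts along the connection vector: add to i, subtract from j */
    fi[0] += fgb*dx; fi[1] += fgb*dy; fi[2] += fgb*dz;
    fj[0] -= fgb*dx; fj[1] -= fgb*dy; fj[2] -= fgb*dz;
}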


#else
/* keep compiler happy */
int genborn_sse_dummy;

#endif /* SSE intrinsics available */