Merge release-4-6 (commit 'Ic142a690')
[alexxy/gromacs.git] / src / gromacs / mdlib / genborn_sse2_single.c
1 /*
2  * 
3  *                This source code is part of
4  * 
5  *                 G   R   O   M   A   C   S
6  * 
7  *          GROningen MAchine for Chemical Simulations
8  * 
9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11  * Copyright (c) 2001-2008, The GROMACS development team,
12  * check out http://www.gromacs.org for more information.
13  
14  * This program is free software; you can redistribute it and/or
15  * modify it under the terms of the GNU General Public License
16  * as published by the Free Software Foundation; either version 2
17  * of the License, or (at your option) any later version.
18  * 
19  * If you want to redistribute modifications, please consider that
20  * scientific software is very special. Version control is crucial -
21  * bugs must be traceable. We will be happy to consider code for
22  * inclusion in the official distribution, but derived work must not
23  * be called official GROMACS. Details are found in the README & COPYING
24  * files - if they are missing, get the official version at www.gromacs.org.
25  * 
26  * To help us fund GROMACS development, we humbly ask that you cite
27  * the papers on the package - you can find them in the top README file.
28  * 
29  * For more info, check our website at http://www.gromacs.org
30  * 
31  * And Hey:
32  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
33  */
34 #ifdef HAVE_CONFIG_H
35 #include <config.h>
36 #endif
37
38 #include <math.h>
39 #include <string.h>
40
41 #include "typedefs.h"
42 #include "smalloc.h"
43 #include "genborn.h"
44 #include "vec.h"
45 #include "grompp.h"
46 #include "pdbio.h"
47 #include "names.h"
48 #include "physics.h"
49 #include "partdec.h"
50 #include "domdec.h"
51 #include "network.h"
52 #include "gmx_fatal.h"
53 #include "mtop_util.h"
54 #include "genborn.h"
55
56 #ifdef GMX_LIB_MPI
57 #include <mpi.h>
58 #endif
59 #ifdef GMX_THREAD_MPI
60 #include "tmpi.h"
61 #endif
62
63
64 /* Only compile this file if SSE intrinsics are available */
65 #if ( (defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_SSE2)) && !defined(GMX_DOUBLE) )
66
67 #include <gmx_sse2_single.h>
68 #include <xmmintrin.h>
69 #include <emmintrin.h>
70
71 #include "genborn_sse2_single.h"
72
73
74 int 
75 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
76                               int natoms, gmx_localtop_t *top,
77                               const t_atomtypes *atype, float *x, t_nblist *nl,
78                               gmx_genborn_t *born)
79 {
80         int i,k,n,ii,is3,ii3,nj0,nj1,offset;
81         int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
82         int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
83         int shift;
84     int *mdtype;
85         real shX,shY,shZ;
86     int *jjnr;
87     real *shiftvec;
88
89         float gpi_ai,gpi2;
90         float factor;
91         float *gb_radius;
92     float *vsolv;
93     float *work;
94     float *dadx;
95     
96         __m128 ix,iy,iz;
97         __m128 jx,jy,jz;
98         __m128 dx,dy,dz;
99         __m128 tx,ty,tz;
100         __m128 jxB,jyB,jzB;
101         __m128 dxB,dyB,dzB;
102         __m128 txB,tyB,tzB;
103         __m128 rsq,rinv,rinv2,rinv4,rinv6;
104         __m128 rsqB,rinvB,rinv2B,rinv4B,rinv6B;
105         __m128 ratio,gpi,rai,raj,vai,vaj,rvdw;
106         __m128 ratioB,rajB,vajB,rvdwB;
107         __m128 ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
108         __m128 ccfB,dccfB,thetaB,cosqB,termB,sinqB,resB,prodB;
109         __m128 mask,icf4,icf6,mask_cmp;
110         __m128 icf4B,icf6B,mask_cmpB;
111         
112     __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
113         __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
114         __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
115     
116         const __m128 half   = _mm_set1_ps(0.5f);
117         const __m128 three  = _mm_set1_ps(3.0f);
118         const __m128 one    = _mm_set1_ps(1.0f);
119         const __m128 two    = _mm_set1_ps(2.0f);
120         const __m128 zero   = _mm_set1_ps(0.0f);
121         const __m128 four   = _mm_set1_ps(4.0f);
122         
123         const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
124         const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
125         const __m128 still_p4     = _mm_set1_ps(STILL_P4);
126                 
127         factor  = 0.5 * ONE_4PI_EPS0;
128                 
129     gb_radius = born->gb_radius;
130     vsolv     = born->vsolv;
131     work      = born->gpol_still_work;
132         jjnr      = nl->jjnr;
133     shiftvec  = fr->shift_vec[0];
134     dadx      = fr->dadx;
135     
136         jnrA = jnrB = jnrC = jnrD = 0;
137     jx = _mm_setzero_ps();
138     jy = _mm_setzero_ps();
139     jz = _mm_setzero_ps();
140     
141         n = 0;
142     
143         for(i=0;i<natoms;i++)
144         {
145                 work[i]=0;
146         }
147
148         for(i=0;i<nl->nri;i++)
149         {
150         ii     = nl->iinr[i];
151                 ii3        = ii*3;
152         is3    = 3*nl->shift[i];     
153         shX    = shiftvec[is3];  
154         shY    = shiftvec[is3+1];
155         shZ    = shiftvec[is3+2];
156         nj0    = nl->jindex[i];      
157         nj1    = nl->jindex[i+1];    
158         
159         ix     = _mm_set1_ps(shX+x[ii3+0]);
160                 iy     = _mm_set1_ps(shY+x[ii3+1]);
161                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
162                 
163                 offset = (nj1-nj0)%4;
164                 
165                 /* Polarization energy for atom ai */
166                 gpi    = _mm_setzero_ps();
167                 
168         rai     = _mm_load1_ps(gb_radius+ii);
169         prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
170         
171                 for(k=nj0;k<nj1-4-offset;k+=8)
172                 {
173                         jnrA        = jjnr[k];   
174                         jnrB        = jjnr[k+1];
175                         jnrC        = jjnr[k+2];
176                         jnrD        = jjnr[k+3];
177                         jnrE        = jjnr[k+4];   
178                         jnrF        = jjnr[k+5];
179                         jnrG        = jjnr[k+6];
180                         jnrH        = jjnr[k+7];
181             
182             j3A         = 3*jnrA;  
183                         j3B         = 3*jnrB;
184                         j3C         = 3*jnrC;
185                         j3D         = 3*jnrD;
186             j3E         = 3*jnrE;  
187                         j3F         = 3*jnrF;
188                         j3G         = 3*jnrG;
189                         j3H         = 3*jnrH;
190             
191             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
192             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
193             
194             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
195             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
196                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
197                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE,vsolv+jnrF,vsolv+jnrG,vsolv+jnrH,vajB);
198
199                         dx          = _mm_sub_ps(ix,jx);
200                         dy          = _mm_sub_ps(iy,jy);
201                         dz          = _mm_sub_ps(iz,jz);
202                         dxB         = _mm_sub_ps(ix,jxB);
203                         dyB         = _mm_sub_ps(iy,jyB);
204                         dzB         = _mm_sub_ps(iz,jzB);
205             
206             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
207             rsqB        = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
208             rinv        = gmx_mm_invsqrt_ps(rsq);
209             rinvB       = gmx_mm_invsqrt_ps(rsqB);
210             rinv2       = _mm_mul_ps(rinv,rinv);
211             rinv2B      = _mm_mul_ps(rinvB,rinvB);
212             rinv4       = _mm_mul_ps(rinv2,rinv2);
213             rinv4B      = _mm_mul_ps(rinv2B,rinv2B);
214             rinv6       = _mm_mul_ps(rinv4,rinv2);
215             rinv6B      = _mm_mul_ps(rinv4B,rinv2B);
216             
217             rvdw        = _mm_add_ps(rai,raj);
218             rvdwB       = _mm_add_ps(rai,rajB);
219             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
220             ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB,rvdwB)));
221
222             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
223             mask_cmpB   = _mm_cmple_ps(ratioB,still_p5inv);
224             
225             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
226             if( 0 == _mm_movemask_ps(mask_cmp) )
227             {
228                 /* if ratio>still_p5inv for ALL elements */
229                 ccf         = one;
230                 dccf        = _mm_setzero_ps();
231             }
232             else 
233             {
234                 ratio       = _mm_min_ps(ratio,still_p5inv);
235                 theta       = _mm_mul_ps(ratio,still_pip5);
236                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
237                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
238                 ccf         = _mm_mul_ps(term,term);
239                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
240                                          _mm_mul_ps(sinq,theta));
241             }
242             if( 0 == _mm_movemask_ps(mask_cmpB) )
243             {
244                 /* if ratio>still_p5inv for ALL elements */
245                 ccfB        = one;
246                 dccfB       = _mm_setzero_ps();
247             }
248             else 
249             {
250                 ratioB      = _mm_min_ps(ratioB,still_p5inv);
251                 thetaB      = _mm_mul_ps(ratioB,still_pip5);
252                 gmx_mm_sincos_ps(thetaB,&sinqB,&cosqB);
253                 termB       = _mm_mul_ps(half,_mm_sub_ps(one,cosqB));
254                 ccfB        = _mm_mul_ps(termB,termB);
255                 dccfB       = _mm_mul_ps(_mm_mul_ps(two,termB),
256                                          _mm_mul_ps(sinqB,thetaB));
257             }
258             
259             prod        = _mm_mul_ps(still_p4,vaj);
260             prodB       = _mm_mul_ps(still_p4,vajB);
261             icf4        = _mm_mul_ps(ccf,rinv4);
262             icf4B       = _mm_mul_ps(ccfB,rinv4B);
263             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
264             icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccfB),dccfB), rinv6B);
265
266             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
267             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_mul_ps(prod_ai,icf4B));
268             
269             gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod,icf4) , _mm_mul_ps(prodB,icf4B) ) );
270             
271             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
272             dadx+=4;
273             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
274             dadx+=4;
275             _mm_store_ps(dadx,_mm_mul_ps(prodB,icf6B));
276             dadx+=4;
277             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6B));
278             dadx+=4;
279                 } 
280  
281         for(;k<nj1-offset;k+=4)
282                 {
283                         jnrA        = jjnr[k];   
284                         jnrB        = jjnr[k+1];
285                         jnrC        = jjnr[k+2];
286                         jnrD        = jjnr[k+3];
287             
288             j3A         = 3*jnrA;  
289                         j3B         = 3*jnrB;
290                         j3C         = 3*jnrC;
291                         j3D         = 3*jnrD;
292             
293             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
294             
295             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
296                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
297             
298                         dx          = _mm_sub_ps(ix,jx);
299                         dy          = _mm_sub_ps(iy,jy);
300                         dz          = _mm_sub_ps(iz,jz);
301             
302             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
303             rinv        = gmx_mm_invsqrt_ps(rsq);
304             rinv2       = _mm_mul_ps(rinv,rinv);
305             rinv4       = _mm_mul_ps(rinv2,rinv2);
306             rinv6       = _mm_mul_ps(rinv4,rinv2);
307             
308             rvdw        = _mm_add_ps(rai,raj);
309             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
310             
311             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
312
313             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
314             if(0 == _mm_movemask_ps(mask_cmp))
315             {
316                 /* if ratio>still_p5inv for ALL elements */
317                 ccf         = one;
318                 dccf        = _mm_setzero_ps();
319             }
320             else 
321             {
322                 ratio       = _mm_min_ps(ratio,still_p5inv);
323                 theta       = _mm_mul_ps(ratio,still_pip5);
324                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
325                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
326                 ccf         = _mm_mul_ps(term,term);
327                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
328                                          _mm_mul_ps(sinq,theta));
329             }
330             
331             prod        = _mm_mul_ps(still_p4,vaj);
332             icf4        = _mm_mul_ps(ccf,rinv4);
333             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
334
335             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
336             
337             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
338             
339             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
340             dadx+=4;
341             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
342             dadx+=4;
343                 } 
344         
345         if(offset!=0)
346         {
347             if(offset==1)
348             {
349                 jnrA        = jjnr[k];   
350                 j3A         = 3*jnrA;  
351                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
352                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
353                 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA,vaj);
354                 mask        = mask1;
355             } 
356             else if(offset==2)
357             {
358                 jnrA        = jjnr[k];   
359                 jnrB        = jjnr[k+1];
360                 j3A         = 3*jnrA;  
361                 j3B         = 3*jnrB;
362                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
363                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
364                 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA,vsolv+jnrB,vaj);
365                 mask        = mask2;
366             }
367             else
368             {
369                 /* offset must be 3 */   
370                 jnrA        = jjnr[k];   
371                 jnrB        = jjnr[k+1];
372                 jnrC        = jjnr[k+2];
373                 j3A         = 3*jnrA;  
374                 j3B         = 3*jnrB;
375                 j3C         = 3*jnrC;
376                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
377                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
378                 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vaj);
379                 mask        = mask3;
380             }
381
382                         dx          = _mm_sub_ps(ix,jx);
383                         dy          = _mm_sub_ps(iy,jy);
384                         dz          = _mm_sub_ps(iz,jz);
385             
386             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
387             rinv        = gmx_mm_invsqrt_ps(rsq);
388             rinv2       = _mm_mul_ps(rinv,rinv);
389             rinv4       = _mm_mul_ps(rinv2,rinv2);
390             rinv6       = _mm_mul_ps(rinv4,rinv2);
391             
392             rvdw        = _mm_add_ps(rai,raj);
393             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
394             
395             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
396             
397             if(0 == _mm_movemask_ps(mask_cmp))
398             {
399                 /* if ratio>still_p5inv for ALL elements */
400                 ccf         = one;
401                 dccf        = _mm_setzero_ps();
402             }
403             else 
404             {
405                 ratio       = _mm_min_ps(ratio,still_p5inv);
406                 theta       = _mm_mul_ps(ratio,still_pip5);
407                 gmx_mm_sincos_ps(theta,&sinq,&cosq);            
408                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
409                 ccf         = _mm_mul_ps(term,term);
410                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
411                                          _mm_mul_ps(sinq,theta));
412             }
413
414             prod        = _mm_mul_ps(still_p4,vaj);
415             icf4        = _mm_mul_ps(ccf,rinv4);
416             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
417             
418             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
419             
420             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
421             dadx+=4;
422             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
423             dadx+=4;
424             
425             tmp = _mm_mul_ps(prod_ai,icf4);
426
427             if(offset==1)
428             {
429                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
430             } 
431             else if(offset==2)
432             {
433                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
434             }
435             else
436             {
437                 /* offset must be 3 */
438                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
439             }
440         }
441         GMX_MM_UPDATE_1POT_PS(gpi,work+ii);
442         }
443
444         /* Sum up the polarization energy from other nodes */
445         if(PARTDECOMP(cr))
446         {
447                 gmx_sum(natoms, work, cr);
448         }
449         else if(DOMAINDECOMP(cr))
450         {
451                 dd_atom_sum_real(cr->dd, work);
452         }
453         
454         /* Compute the radii */
455         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
456         {               
457                 if(born->use[i] != 0)
458                 {
459                         gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
460                         gpi2             = gpi_ai * gpi_ai;
461                         born->bRad[i]   = factor*gmx_invsqrt(gpi2);
462                         fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
463                 }
464         }
465                 
466         /* Extra (local) communication required for DD */
467         if(DOMAINDECOMP(cr))
468         {
469                 dd_atom_spread_real(cr->dd, born->bRad);
470                 dd_atom_spread_real(cr->dd, fr->invsqrta);
471         }
472     
473         return 0;       
474 }
475
476
477 int 
478 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
479                                 const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
480 {
481         int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
482     int jnrA,jnrB,jnrC,jnrD;
483     int j3A,j3B,j3C,j3D;
484     int jnrE,jnrF,jnrG,jnrH;
485     int j3E,j3F,j3G,j3H;
486         float shX,shY,shZ;
487         float rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
488         float sum_ai2, sum_ai3,tsum,tchain,doffset;
489         float *obc_param;
490     float *gb_radius;
491     float *work;
492     int *  jjnr;
493     float *dadx;
494     float *shiftvec;
495     float min_rad,rad;
496     
497         __m128 ix,iy,iz,jx,jy,jz;
498         __m128 dx,dy,dz,t1,t2,t3,t4;
499         __m128 rsq,rinv,r;
500         __m128 rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
501         __m128 uij,lij2,uij2,lij3,uij3,diff2;
502         __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
503         __m128 sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
504         __m128 dadx1,dadx2;
505     __m128 logterm;
506         __m128 mask;
507         __m128 obc_mask1,obc_mask2,obc_mask3;
508     __m128 jxB,jyB,jzB,t1B,t2B,t3B,t4B;
509     __m128 dxB,dyB,dzB,rsqB,rinvB,rB;
510         __m128 rajB, raj_invB,rai_inv2B,sk2B,lijB,dlijB,duijB;
511         __m128 uijB,lij2B,uij2B,lij3B,uij3B,diff2B;
512         __m128 lij_invB,sk2_invB,prodB;
513         __m128 sk_ajB,sk2_ajB,sk2_rinvB;
514         __m128 dadx1B,dadx2B;
515     __m128 logtermB;
516     __m128 obc_mask1B,obc_mask2B,obc_mask3B;
517
518     __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
519         __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
520         __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
521         
522     __m128 oneeighth   = _mm_set1_ps(0.125);
523     __m128 onefourth   = _mm_set1_ps(0.25);
524
525         const __m128 half  = _mm_set1_ps(0.5f);
526         const __m128 three = _mm_set1_ps(3.0f);
527         const __m128 one   = _mm_set1_ps(1.0f);
528         const __m128 two   = _mm_set1_ps(2.0f);
529         const __m128 zero  = _mm_set1_ps(0.0f);
530         const __m128 neg   = _mm_set1_ps(-1.0f);
531         
532         /* Set the dielectric offset */
533         doffset   = born->gb_doffset;
534         gb_radius = born->gb_radius;
535     obc_param = born->param;
536     work      = born->gpol_hct_work;
537     jjnr      = nl->jjnr;
538     dadx      = fr->dadx;
539     shiftvec  = fr->shift_vec[0];
540     
541     jx        = _mm_setzero_ps();
542     jy        = _mm_setzero_ps();
543     jz        = _mm_setzero_ps();
544     
545     jnrA = jnrB = jnrC = jnrD = 0;
546     
547         for(i=0;i<born->nr;i++)
548         {
549                 work[i] = 0;
550         }
551         
552         for(i=0;i<nl->nri;i++)
553         {
554         ii     = nl->iinr[i];
555                 ii3        = ii*3;
556         is3    = 3*nl->shift[i];     
557         shX    = shiftvec[is3];  
558         shY    = shiftvec[is3+1];
559         shZ    = shiftvec[is3+2];
560         nj0    = nl->jindex[i];      
561         nj1    = nl->jindex[i+1];    
562         
563         ix     = _mm_set1_ps(shX+x[ii3+0]);
564                 iy     = _mm_set1_ps(shY+x[ii3+1]);
565                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
566                 
567                 offset = (nj1-nj0)%4;
568
569                 rai    = _mm_load1_ps(gb_radius+ii);
570                 rai_inv= gmx_mm_inv_ps(rai);
571                                 
572                 sum_ai = _mm_setzero_ps();
573                 
574                 sk_ai  = _mm_load1_ps(born->param+ii);
575                 sk2_ai = _mm_mul_ps(sk_ai,sk_ai);
576                                 
577                 for(k=nj0;k<nj1-4-offset;k+=8)
578                 {
579                         jnrA        = jjnr[k];   
580                         jnrB        = jjnr[k+1];
581                         jnrC        = jjnr[k+2];
582                         jnrD        = jjnr[k+3];
583                         jnrE        = jjnr[k+4];   
584                         jnrF        = jjnr[k+5];
585                         jnrG        = jjnr[k+6];
586                         jnrH        = jjnr[k+7];
587                         
588             j3A         = 3*jnrA;  
589                         j3B         = 3*jnrB;
590                         j3C         = 3*jnrC;
591                         j3D         = 3*jnrD;
592             j3E         = 3*jnrE;  
593                         j3F         = 3*jnrF;
594                         j3G         = 3*jnrG;
595                         j3H         = 3*jnrH;
596             
597             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
598             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
599             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
600             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
601             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
602             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE,obc_param+jnrF,obc_param+jnrG,obc_param+jnrH,sk_ajB);
603                         
604             dx    = _mm_sub_ps(ix, jx);
605                         dy    = _mm_sub_ps(iy, jy);
606                         dz    = _mm_sub_ps(iz, jz);
607             dxB   = _mm_sub_ps(ix, jxB);
608                         dyB   = _mm_sub_ps(iy, jyB);
609                         dzB   = _mm_sub_ps(iz, jzB);
610                         
611             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
612             rsqB        = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
613                                     
614             rinv        = gmx_mm_invsqrt_ps(rsq);
615             r           = _mm_mul_ps(rsq,rinv);
616             rinvB       = gmx_mm_invsqrt_ps(rsqB);
617             rB          = _mm_mul_ps(rsqB,rinvB);
618
619                         /* Compute raj_inv aj1-4 */
620             raj_inv     = gmx_mm_inv_ps(raj);
621             raj_invB    = gmx_mm_inv_ps(rajB);
622
623             /* Evaluate influence of atom aj -> ai */
624             t1            = _mm_add_ps(r,sk_aj);
625             t2            = _mm_sub_ps(r,sk_aj);
626             t3            = _mm_sub_ps(sk_aj,r);
627             t1B           = _mm_add_ps(rB,sk_ajB);
628             t2B           = _mm_sub_ps(rB,sk_ajB);
629             t3B           = _mm_sub_ps(sk_ajB,rB);
630             obc_mask1     = _mm_cmplt_ps(rai, t1);
631             obc_mask2     = _mm_cmplt_ps(rai, t2);
632             obc_mask3     = _mm_cmplt_ps(rai, t3);
633             obc_mask1B    = _mm_cmplt_ps(rai, t1B);
634             obc_mask2B    = _mm_cmplt_ps(rai, t2B);
635             obc_mask3B    = _mm_cmplt_ps(rai, t3B);
636
637             uij           = gmx_mm_inv_ps(t1);
638             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
639                                       _mm_andnot_ps(obc_mask2,rai_inv));
640             dlij          = _mm_and_ps(one,obc_mask2);
641             uij2          = _mm_mul_ps(uij, uij);
642             uij3          = _mm_mul_ps(uij2,uij);
643             lij2          = _mm_mul_ps(lij, lij);
644             lij3          = _mm_mul_ps(lij2,lij);
645
646             uijB          = gmx_mm_inv_ps(t1B);
647             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
648                                       _mm_andnot_ps(obc_mask2B,rai_inv));
649             dlijB         = _mm_and_ps(one,obc_mask2B);
650             uij2B         = _mm_mul_ps(uijB, uijB);
651             uij3B         = _mm_mul_ps(uij2B,uijB);
652             lij2B         = _mm_mul_ps(lijB, lijB);
653             lij3B         = _mm_mul_ps(lij2B,lijB);
654
655             diff2         = _mm_sub_ps(uij2,lij2);
656             lij_inv       = gmx_mm_invsqrt_ps(lij2);
657             sk2_aj        = _mm_mul_ps(sk_aj,sk_aj);
658             sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
659             prod          = _mm_mul_ps(onefourth,sk2_rinv);
660
661             diff2B        = _mm_sub_ps(uij2B,lij2B);
662             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
663             sk2_ajB       = _mm_mul_ps(sk_ajB,sk_ajB);
664             sk2_rinvB     = _mm_mul_ps(sk2_ajB,rinvB);
665             prodB         = _mm_mul_ps(onefourth,sk2_rinvB);
666
667             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
668             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
669             
670             t1            = _mm_sub_ps(lij,uij);
671             t2            = _mm_mul_ps(diff2,
672                                        _mm_sub_ps(_mm_mul_ps(onefourth,r),
673                                                   prod));
674             t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
675             t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
676             t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
677             t4            = _mm_and_ps(t4,obc_mask3);
678             t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
679             
680             t1B           = _mm_sub_ps(lijB,uijB);
681             t2B           = _mm_mul_ps(diff2B,
682                                        _mm_sub_ps(_mm_mul_ps(onefourth,rB),
683                                                   prodB));
684             t3B           = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
685             t1B           = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
686             t4B           = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lijB));
687             t4B           = _mm_and_ps(t4B,obc_mask3B);
688             t1B           = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
689             
690             sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1,obc_mask1), _mm_and_ps(t1B,obc_mask1B) ));
691             
692             t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
693                                        _mm_mul_ps(prod,lij3));
694             t1            = _mm_sub_ps(t1,
695                                        _mm_mul_ps(onefourth,
696                                                   _mm_add_ps(_mm_mul_ps(lij,rinv),
697                                                              _mm_mul_ps(lij3,r))));
698             t2            = _mm_mul_ps(onefourth,
699                                        _mm_add_ps(_mm_mul_ps(uij,rinv),
700                                                   _mm_mul_ps(uij3,r)));
701             t2            = _mm_sub_ps(t2,
702                                        _mm_add_ps(_mm_mul_ps(half,uij2),
703                                                       _mm_mul_ps(prod,uij3)));
704             t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
705                                        _mm_mul_ps(rinv,rinv));
706             t3            = _mm_sub_ps(t3,
707                                        _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
708                                                   _mm_add_ps(one,
709                                                              _mm_mul_ps(sk2_rinv,rinv))));
710             t1            = _mm_mul_ps(rinv,
711                                        _mm_add_ps(_mm_mul_ps(dlij,t1),
712                                                   _mm_add_ps(t2,t3)));
713             
714
715             
716             t1B           = _mm_add_ps(_mm_mul_ps(half,lij2B),
717                                        _mm_mul_ps(prodB,lij3B));
718             t1B           = _mm_sub_ps(t1B,
719                                        _mm_mul_ps(onefourth,
720                                                   _mm_add_ps(_mm_mul_ps(lijB,rinvB),
721                                                              _mm_mul_ps(lij3B,rB))));
722             t2B           = _mm_mul_ps(onefourth,
723                                        _mm_add_ps(_mm_mul_ps(uijB,rinvB),
724                                                   _mm_mul_ps(uij3B,rB)));
725             t2B           = _mm_sub_ps(t2B,
726                                        _mm_add_ps(_mm_mul_ps(half,uij2B),
727                                                   _mm_mul_ps(prodB,uij3B)));
728             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
729                                        _mm_mul_ps(rinvB,rinvB));
730             t3B           = _mm_sub_ps(t3B,
731                                        _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
732                                                   _mm_add_ps(one,
733                                                              _mm_mul_ps(sk2_rinvB,rinvB))));
734             t1B           = _mm_mul_ps(rinvB,
735                                        _mm_add_ps(_mm_mul_ps(dlijB,t1B),
736                                                   _mm_add_ps(t2B,t3B)));
737             
738             dadx1         = _mm_and_ps(t1,obc_mask1);
739             dadx1B        = _mm_and_ps(t1B,obc_mask1B);
740
741
742             /* Evaluate influence of atom ai -> aj */
743             t1            = _mm_add_ps(r,sk_ai);
744             t2            = _mm_sub_ps(r,sk_ai);
745             t3            = _mm_sub_ps(sk_ai,r);
746             t1B           = _mm_add_ps(rB,sk_ai);
747             t2B           = _mm_sub_ps(rB,sk_ai);
748             t3B           = _mm_sub_ps(sk_ai,rB);
749             obc_mask1     = _mm_cmplt_ps(raj, t1);
750             obc_mask2     = _mm_cmplt_ps(raj, t2);
751             obc_mask3     = _mm_cmplt_ps(raj, t3);
752             obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
753             obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
754             obc_mask3B    = _mm_cmplt_ps(rajB, t3B);
755             
756             uij           = gmx_mm_inv_ps(t1);
757             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
758                                       _mm_andnot_ps(obc_mask2,raj_inv));
759             dlij          = _mm_and_ps(one,obc_mask2);
760             uij2          = _mm_mul_ps(uij, uij);
761             uij3          = _mm_mul_ps(uij2,uij);
762             lij2          = _mm_mul_ps(lij, lij);
763             lij3          = _mm_mul_ps(lij2,lij);
764
765             uijB          = gmx_mm_inv_ps(t1B);
766             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
767                                       _mm_andnot_ps(obc_mask2B,raj_invB));
768             dlijB         = _mm_and_ps(one,obc_mask2B);
769             uij2B         = _mm_mul_ps(uijB, uijB);
770             uij3B         = _mm_mul_ps(uij2B,uijB);
771             lij2B         = _mm_mul_ps(lijB, lijB);
772             lij3B         = _mm_mul_ps(lij2B,lijB);
773
774             diff2         = _mm_sub_ps(uij2,lij2);
775             lij_inv       = gmx_mm_invsqrt_ps(lij2);
776             sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
777             prod          = _mm_mul_ps(onefourth,sk2_rinv);
778
779             diff2B        = _mm_sub_ps(uij2B,lij2B);
780             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
781             sk2_rinvB     = _mm_mul_ps(sk2_ai,rinvB);
782             prodB         = _mm_mul_ps(onefourth,sk2_rinvB);
783
784             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
785             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
786
787             t1            = _mm_sub_ps(lij,uij);
788             t2            = _mm_mul_ps(diff2,
789                                        _mm_sub_ps(_mm_mul_ps(onefourth,r),
790                                                   prod));
791             t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
792             t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
793             t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
794             t4            = _mm_and_ps(t4,obc_mask3);
795             t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
796
797             t1B           = _mm_sub_ps(lijB,uijB);
798             t2B           = _mm_mul_ps(diff2B,
799                                        _mm_sub_ps(_mm_mul_ps(onefourth,rB),
800                                                   prodB));
801             t3B           = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
802             t1B           = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
803             t4B           = _mm_mul_ps(two,_mm_sub_ps(raj_invB,lijB));
804             t4B           = _mm_and_ps(t4B,obc_mask3B);
805             t1B           = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
806             
807             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
808             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_and_ps(t1B,obc_mask1B));
809             
810             t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
811                                        _mm_mul_ps(prod,lij3));
812             t1            = _mm_sub_ps(t1,
813                                        _mm_mul_ps(onefourth,
814                                                   _mm_add_ps(_mm_mul_ps(lij,rinv),
815                                                              _mm_mul_ps(lij3,r))));
816             t2            = _mm_mul_ps(onefourth,
817                                        _mm_add_ps(_mm_mul_ps(uij,rinv),
818                                                   _mm_mul_ps(uij3,r)));
819             t2            = _mm_sub_ps(t2,
820                                        _mm_add_ps(_mm_mul_ps(half,uij2),
821                                                   _mm_mul_ps(prod,uij3)));
822             t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
823                                        _mm_mul_ps(rinv,rinv));
824             t3            = _mm_sub_ps(t3,
825                                        _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
826                                                   _mm_add_ps(one,
827                                                              _mm_mul_ps(sk2_rinv,rinv))));
828             t1            = _mm_mul_ps(rinv,
829                                        _mm_add_ps(_mm_mul_ps(dlij,t1),
830                                                   _mm_add_ps(t2,t3)));
831
832             
833             t1B           = _mm_add_ps(_mm_mul_ps(half,lij2B),
834                                        _mm_mul_ps(prodB,lij3B));
835             t1B           = _mm_sub_ps(t1B,
836                                        _mm_mul_ps(onefourth,
837                                                   _mm_add_ps(_mm_mul_ps(lijB,rinvB),
838                                                              _mm_mul_ps(lij3B,rB))));
839             t2B           = _mm_mul_ps(onefourth,
840                                        _mm_add_ps(_mm_mul_ps(uijB,rinvB),
841                                                   _mm_mul_ps(uij3B,rB)));
842             t2B           = _mm_sub_ps(t2B,
843                                        _mm_add_ps(_mm_mul_ps(half,uij2B),
844                                                   _mm_mul_ps(prodB,uij3B)));
845             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
846                                        _mm_mul_ps(rinvB,rinvB));
847             t3B           = _mm_sub_ps(t3B,
848                                        _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
849                                                   _mm_add_ps(one,
850                                                              _mm_mul_ps(sk2_rinvB,rinvB))));
851             t1B           = _mm_mul_ps(rinvB,
852                                        _mm_add_ps(_mm_mul_ps(dlijB,t1B),
853                                                   _mm_add_ps(t2B,t3B)));
854
855             
856             dadx2         = _mm_and_ps(t1,obc_mask1);
857             dadx2B        = _mm_and_ps(t1B,obc_mask1B);
858             
859             _mm_store_ps(dadx,dadx1);
860             dadx += 4;
861             _mm_store_ps(dadx,dadx2);
862             dadx += 4;
863             _mm_store_ps(dadx,dadx1B);
864             dadx += 4;
865             _mm_store_ps(dadx,dadx2B);
866             dadx += 4;
867             
868         } /* end normal inner loop */
869         
870                 for(;k<nj1-offset;k+=4)
871                 {
872                         jnrA        = jjnr[k];   
873                         jnrB        = jjnr[k+1];
874                         jnrC        = jjnr[k+2];
875                         jnrD        = jjnr[k+3];
876                         
877             j3A         = 3*jnrA;  
878                         j3B         = 3*jnrB;
879                         j3C         = 3*jnrC;
880                         j3D         = 3*jnrD;
881             
882             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
883             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
884             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
885                         
886             dx    = _mm_sub_ps(ix, jx);
887                         dy    = _mm_sub_ps(iy, jy);
888                         dz    = _mm_sub_ps(iz, jz);
889                         
890             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
891             
892             rinv        = gmx_mm_invsqrt_ps(rsq);
893             r           = _mm_mul_ps(rsq,rinv);
894             
895                         /* Compute raj_inv aj1-4 */
896             raj_inv     = gmx_mm_inv_ps(raj);
897             
898             /* Evaluate influence of atom aj -> ai */
899             t1            = _mm_add_ps(r,sk_aj);
900             obc_mask1     = _mm_cmplt_ps(rai, t1);
901             
902             if(_mm_movemask_ps(obc_mask1))
903             {
904                 /* If any of the elements has rai<dr+sk, this is executed */
905                 t2            = _mm_sub_ps(r,sk_aj);
906                 t3            = _mm_sub_ps(sk_aj,r);
907                 
908                 obc_mask2     = _mm_cmplt_ps(rai, t2);
909                 obc_mask3     = _mm_cmplt_ps(rai, t3);
910                 
911                 uij           = gmx_mm_inv_ps(t1);
912                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
913                                           _mm_andnot_ps(obc_mask2,rai_inv));
914                 dlij          = _mm_and_ps(one,obc_mask2);
915                 uij2          = _mm_mul_ps(uij, uij);
916                 uij3          = _mm_mul_ps(uij2,uij);
917                 lij2          = _mm_mul_ps(lij, lij);
918                 lij3          = _mm_mul_ps(lij2,lij);
919                 diff2         = _mm_sub_ps(uij2,lij2);
920                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
921                 sk2_aj        = _mm_mul_ps(sk_aj,sk_aj);
922                 sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
923                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
924                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
925                 t1            = _mm_sub_ps(lij,uij);
926                 t2            = _mm_mul_ps(diff2,
927                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
928                                                       prod));
929                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
930                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
931                 t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
932                 t4            = _mm_and_ps(t4,obc_mask3);
933                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
934                 sum_ai        = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
935                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
936                                            _mm_mul_ps(prod,lij3));
937                 t1            = _mm_sub_ps(t1,
938                                            _mm_mul_ps(onefourth,
939                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
940                                                                  _mm_mul_ps(lij3,r))));
941                 t2            = _mm_mul_ps(onefourth,
942                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
943                                                       _mm_mul_ps(uij3,r)));
944                 t2            = _mm_sub_ps(t2,
945                                            _mm_add_ps(_mm_mul_ps(half,uij2),
946                                                       _mm_mul_ps(prod,uij3)));
947                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
948                                            _mm_mul_ps(rinv,rinv));
949                 t3            = _mm_sub_ps(t3,
950                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
951                                                       _mm_add_ps(one,
952                                                                  _mm_mul_ps(sk2_rinv,rinv))));
953                 t1            = _mm_mul_ps(rinv,
954                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
955                                                       _mm_add_ps(t2,t3)));
956                 
957                 dadx1         = _mm_and_ps(t1,obc_mask1);
958             }
959             else 
960             {
961                 dadx1         = _mm_setzero_ps();
962             }
963             
964             /* Evaluate influence of atom ai -> aj */
965             t1            = _mm_add_ps(r,sk_ai);
966             obc_mask1     = _mm_cmplt_ps(raj, t1);
967             
968             if(_mm_movemask_ps(obc_mask1))
969             {
970                 t2            = _mm_sub_ps(r,sk_ai);
971                 t3            = _mm_sub_ps(sk_ai,r);
972                 obc_mask2     = _mm_cmplt_ps(raj, t2);
973                 obc_mask3     = _mm_cmplt_ps(raj, t3);
974                 
975                 uij           = gmx_mm_inv_ps(t1);
976                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
977                                           _mm_andnot_ps(obc_mask2,raj_inv));
978                 dlij          = _mm_and_ps(one,obc_mask2);
979                 uij2          = _mm_mul_ps(uij, uij);
980                 uij3          = _mm_mul_ps(uij2,uij);
981                 lij2          = _mm_mul_ps(lij, lij);
982                 lij3          = _mm_mul_ps(lij2,lij);
983                 diff2         = _mm_sub_ps(uij2,lij2);
984                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
985                 sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
986                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
987                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
988                 t1            = _mm_sub_ps(lij,uij);
989                 t2            = _mm_mul_ps(diff2,
990                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
991                                                       prod));
992                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
993                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
994                 t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
995                 t4            = _mm_and_ps(t4,obc_mask3);
996                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
997                 
998                 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
999                 
1000                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1001                                            _mm_mul_ps(prod,lij3));
1002                 t1            = _mm_sub_ps(t1,
1003                                            _mm_mul_ps(onefourth,
1004                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1005                                                                  _mm_mul_ps(lij3,r))));
1006                 t2            = _mm_mul_ps(onefourth,
1007                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1008                                                       _mm_mul_ps(uij3,r)));
1009                 t2            = _mm_sub_ps(t2,
1010                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1011                                                       _mm_mul_ps(prod,uij3)));
1012                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1013                                            _mm_mul_ps(rinv,rinv));
1014                 t3            = _mm_sub_ps(t3,
1015                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1016                                                       _mm_add_ps(one,
1017                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1018                 t1            = _mm_mul_ps(rinv,
1019                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1020                                                       _mm_add_ps(t2,t3)));
1021                 dadx2         = _mm_and_ps(t1,obc_mask1);
1022             }
1023             else
1024             {
1025                 dadx2         = _mm_setzero_ps();
1026             }
1027             
1028             _mm_store_ps(dadx,dadx1);
1029             dadx += 4;
1030             _mm_store_ps(dadx,dadx2);
1031             dadx += 4;            
1032         } /* end normal inner loop */
1033         
1034         if(offset!=0)
1035         {
1036             if(offset==1)
1037             {
1038                 jnrA        = jjnr[k];   
1039                 j3A         = 3*jnrA;  
1040                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1041                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
1042                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA,sk_aj);
1043                 mask        = mask1;
1044             } 
1045             else if(offset==2)
1046             {
1047                 jnrA        = jjnr[k];   
1048                 jnrB        = jjnr[k+1];
1049                 j3A         = 3*jnrA;  
1050                 j3B         = 3*jnrB;
1051                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1052                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
1053                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA,obc_param+jnrB,sk_aj);
1054                 mask        = mask2;
1055             }
1056             else
1057             {
1058                 /* offset must be 3 */   
1059                 jnrA        = jjnr[k];   
1060                 jnrB        = jjnr[k+1];
1061                 jnrC        = jjnr[k+2];
1062                 j3A         = 3*jnrA;  
1063                 j3B         = 3*jnrB;
1064                 j3C         = 3*jnrC;
1065                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1066                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
1067                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,sk_aj);
1068                 mask        = mask3;
1069             }
1070
1071             dx    = _mm_sub_ps(ix, jx);
1072                         dy    = _mm_sub_ps(iy, jy);
1073                         dz    = _mm_sub_ps(iz, jz);
1074                         
1075             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
1076             
1077             rinv        = gmx_mm_invsqrt_ps(rsq);
1078             r           = _mm_mul_ps(rsq,rinv);
1079             
1080                         /* Compute raj_inv aj1-4 */
1081             raj_inv     = gmx_mm_inv_ps(raj);
1082             
1083             /* Evaluate influence of atom aj -> ai */
1084             t1            = _mm_add_ps(r,sk_aj);
1085             obc_mask1     = _mm_cmplt_ps(rai, t1);
1086             obc_mask1     = _mm_and_ps(obc_mask1,mask);
1087
1088             if(_mm_movemask_ps(obc_mask1))
1089             {
1090                 t2            = _mm_sub_ps(r,sk_aj);
1091                 t3            = _mm_sub_ps(sk_aj,r);
1092                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1093                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1094                 
1095                 uij           = gmx_mm_inv_ps(t1);
1096                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1097                                           _mm_andnot_ps(obc_mask2,rai_inv));
1098                 dlij          = _mm_and_ps(one,obc_mask2);
1099                 uij2          = _mm_mul_ps(uij, uij);
1100                 uij3          = _mm_mul_ps(uij2,uij);
1101                 lij2          = _mm_mul_ps(lij, lij);
1102                 lij3          = _mm_mul_ps(lij2,lij);
1103                 diff2         = _mm_sub_ps(uij2,lij2);
1104                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1105                 sk2_aj         = _mm_mul_ps(sk_aj,sk_aj);
1106                 sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
1107                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
1108                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1109                 t1            = _mm_sub_ps(lij,uij);
1110                 t2            = _mm_mul_ps(diff2,
1111                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
1112                                                       prod));
1113                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1114                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1115                 t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
1116                 t4            = _mm_and_ps(t4,obc_mask3);
1117                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1118                 sum_ai        = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
1119                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1120                                            _mm_mul_ps(prod,lij3));
1121                 t1            = _mm_sub_ps(t1,
1122                                            _mm_mul_ps(onefourth,
1123                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1124                                                                  _mm_mul_ps(lij3,r))));
1125                 t2            = _mm_mul_ps(onefourth,
1126                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1127                                                       _mm_mul_ps(uij3,r)));
1128                 t2            = _mm_sub_ps(t2,
1129                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1130                                                       _mm_mul_ps(prod,uij3)));
1131                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1132                                            _mm_mul_ps(rinv,rinv));
1133                 t3            = _mm_sub_ps(t3,
1134                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1135                                                       _mm_add_ps(one,
1136                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1137                 t1            = _mm_mul_ps(rinv,
1138                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1139                                                       _mm_add_ps(t2,t3)));
1140                 dadx1         = _mm_and_ps(t1,obc_mask1);
1141             }
1142             else
1143             {
1144                 dadx1         = _mm_setzero_ps();
1145             }
1146
1147                 /* Evaluate influence of atom ai -> aj */
1148             t1            = _mm_add_ps(r,sk_ai);
1149             obc_mask1     = _mm_cmplt_ps(raj, t1);
1150             obc_mask1     = _mm_and_ps(obc_mask1,mask);
1151             
1152             if(_mm_movemask_ps(obc_mask1))
1153             {
1154                 t2            = _mm_sub_ps(r,sk_ai);
1155                 t3            = _mm_sub_ps(sk_ai,r);
1156                 obc_mask2     = _mm_cmplt_ps(raj, t2);
1157                 obc_mask3     = _mm_cmplt_ps(raj, t3);
1158             
1159                 uij           = gmx_mm_inv_ps(t1);
1160                 lij           = _mm_or_ps(_mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1161                                           _mm_andnot_ps(obc_mask2,raj_inv));
1162                 dlij          = _mm_and_ps(one,obc_mask2);
1163                 uij2          = _mm_mul_ps(uij, uij);
1164                 uij3          = _mm_mul_ps(uij2,uij);
1165                 lij2          = _mm_mul_ps(lij, lij);
1166                 lij3          = _mm_mul_ps(lij2,lij);
1167                 diff2         = _mm_sub_ps(uij2,lij2);
1168                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1169                 sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
1170                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
1171                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1172                 t1            = _mm_sub_ps(lij,uij);
1173                 t2            = _mm_mul_ps(diff2,
1174                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
1175                                                       prod));
1176                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1177                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1178                 t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
1179                 t4            = _mm_and_ps(t4,obc_mask3);
1180                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1181                 
1182                 tmp           = _mm_and_ps(t1,obc_mask1);
1183                 
1184                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1185                                            _mm_mul_ps(prod,lij3));
1186                 t1            = _mm_sub_ps(t1,
1187                                            _mm_mul_ps(onefourth,
1188                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1189                                                                  _mm_mul_ps(lij3,r))));
1190                 t2            = _mm_mul_ps(onefourth,
1191                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1192                                                       _mm_mul_ps(uij3,r)));
1193                 t2            = _mm_sub_ps(t2,
1194                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1195                                                       _mm_mul_ps(prod,uij3)));
1196                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1197                                            _mm_mul_ps(rinv,rinv));
1198                 t3            = _mm_sub_ps(t3,
1199                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1200                                                       _mm_add_ps(one,
1201                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1202                 t1            = _mm_mul_ps(rinv,
1203                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1204                                                       _mm_add_ps(t2,t3)));
1205                 dadx2         = _mm_and_ps(t1,obc_mask1);
1206             }
1207             else
1208             {
1209                 dadx2         = _mm_setzero_ps();
1210                 tmp           = _mm_setzero_ps();
1211             }
1212             
1213             _mm_store_ps(dadx,dadx1);
1214             dadx += 4;
1215             _mm_store_ps(dadx,dadx2);
1216             dadx += 4;
1217             
1218             if(offset==1)
1219             {
1220                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
1221             } 
1222             else if(offset==2)
1223             {
1224                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
1225             }
1226             else
1227             {
1228                 /* offset must be 3 */
1229                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
1230             }
1231             
1232         }
1233         GMX_MM_UPDATE_1POT_PS(sum_ai,work+ii);
1234         
1235         }
1236         
1237         /* Parallel summations */
1238         if(PARTDECOMP(cr))
1239         {
1240                 gmx_sum(natoms, work, cr);
1241         }
1242         else if(DOMAINDECOMP(cr))
1243         {
1244                 dd_atom_sum_real(cr->dd, work);
1245         }
1246         
1247     if(gb_algorithm==egbHCT)
1248     {
1249         /* HCT */
1250         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1251         {
1252                         if(born->use[i] != 0)
1253             {
1254                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset; 
1255                 sum     = 1.0/rr - work[i];
1256                 min_rad = rr + doffset;
1257                 rad     = 1.0/sum; 
1258                 
1259                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
1260                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1261             }
1262         }
1263         
1264         /* Extra communication required for DD */
1265         if(DOMAINDECOMP(cr))
1266         {
1267             dd_atom_spread_real(cr->dd, born->bRad);
1268             dd_atom_spread_real(cr->dd, fr->invsqrta);
1269         }
1270     }
1271     else
1272     {
1273         /* OBC */
1274         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1275         {
1276                         if(born->use[i] != 0)
1277             {
1278                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
1279                 rr_inv2 = 1.0/rr;
1280                 rr      = rr-doffset; 
1281                 rr_inv  = 1.0/rr;
1282                 sum     = rr * work[i];
1283                 sum2    = sum  * sum;
1284                 sum3    = sum2 * sum;
1285                 
1286                 tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1287                 born->bRad[i] = rr_inv - tsum*rr_inv2;
1288                 born->bRad[i] = 1.0 / born->bRad[i];
1289                 
1290                 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1291                 
1292                 tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1293                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1294             }
1295         }
1296         /* Extra (local) communication required for DD */
1297         if(DOMAINDECOMP(cr))
1298         {
1299             dd_atom_spread_real(cr->dd, born->bRad);
1300             dd_atom_spread_real(cr->dd, fr->invsqrta);
1301             dd_atom_spread_real(cr->dd, born->drobc);
1302         }
1303     }
1304
1305         
1306         
1307         return 0;
1308 }
1309
1310
1311
1312 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda, 
1313                                     float *x, float *f, float *fshift, float *shiftvec,
1314                                     int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)                                               
1315 {
1316         int    i,k,n,ii,jnr,ii3,is3,nj0,nj1,offset,n0,n1;
1317         int        jnrA,jnrB,jnrC,jnrD;
1318     int    j3A,j3B,j3C,j3D;
1319         int        jnrE,jnrF,jnrG,jnrH;
1320     int    j3E,j3F,j3G,j3H;
1321         int *  jjnr;
1322     
1323         float   rbi,shX,shY,shZ;
1324         float   *rb;
1325     
1326         __m128 ix,iy,iz;
1327         __m128 jx,jy,jz;
1328         __m128 jxB,jyB,jzB;
1329         __m128 fix,fiy,fiz;
1330         __m128 dx,dy,dz;
1331     __m128 tx,ty,tz;
1332         __m128 dxB,dyB,dzB;
1333     __m128 txB,tyB,tzB;
1334
1335         __m128 rbai,rbaj,rbajB, f_gb, f_gb_ai,f_gbB,f_gb_aiB;
1336         __m128 xmm1,xmm2,xmm3;
1337         
1338         const __m128 two = _mm_set1_ps(2.0f);
1339     
1340         rb     = born->work; 
1341                         
1342     jjnr   = nl->jjnr;
1343     
1344         /* Loop to get the proper form for the Born radius term, sse style */
1345         offset=natoms%4;
1346         
1347   n0 = 0;
1348   n1 = natoms;
1349   
1350         if(gb_algorithm==egbSTILL) 
1351         {
1352                 for(i=n0;i<n1;i++)
1353                 {
1354       rbi   = born->bRad[i];
1355                         rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1356                 }
1357         }
1358         else if(gb_algorithm==egbHCT) 
1359         {
1360                 for(i=n0;i<n1;i++)
1361                 {
1362       rbi   = born->bRad[i];
1363                         rb[i] = rbi * rbi * dvda[i];
1364                 }
1365         }
1366         else if(gb_algorithm==egbOBC) 
1367         {
1368                 for(i=n0;i<n1;i++)
1369                 {
1370       rbi   = born->bRad[i];
1371                         rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1372                 }
1373         }
1374     
1375     jz = _mm_setzero_ps();
1376     
1377     n = j3A = j3B = j3C = j3D = 0;
1378     
1379         for(i=0;i<nl->nri;i++)
1380         {
1381         ii     = nl->iinr[i];
1382                 ii3        = ii*3;
1383         is3    = 3*nl->shift[i];     
1384         shX    = shiftvec[is3];  
1385         shY    = shiftvec[is3+1];
1386         shZ    = shiftvec[is3+2];
1387         nj0    = nl->jindex[i];      
1388         nj1    = nl->jindex[i+1];    
1389
1390         ix     = _mm_set1_ps(shX+x[ii3+0]);
1391                 iy     = _mm_set1_ps(shY+x[ii3+1]);
1392                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
1393                 
1394                 offset = (nj1-nj0)%4;
1395                 
1396                 rbai   = _mm_load1_ps(rb+ii);                   
1397                 fix    = _mm_setzero_ps();
1398                 fiy    = _mm_setzero_ps();
1399                 fiz    = _mm_setzero_ps();      
1400                                 
1401
1402         for(k=nj0;k<nj1-offset;k+=4)
1403                 {
1404                         jnrA        = jjnr[k];   
1405                         jnrB        = jjnr[k+1];
1406                         jnrC        = jjnr[k+2];
1407                         jnrD        = jjnr[k+3];
1408             
1409             j3A         = 3*jnrA;  
1410                         j3B         = 3*jnrB;
1411                         j3C         = 3*jnrC;
1412                         j3D         = 3*jnrD;
1413             
1414             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
1415             
1416                         dx          = _mm_sub_ps(ix,jx);
1417                         dy          = _mm_sub_ps(iy,jy);
1418                         dz          = _mm_sub_ps(iz,jz);
1419             
1420             GMX_MM_LOAD_4VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rb+jnrD,rbaj);
1421             
1422                         /* load chain rule terms for j1-4 */
1423                         f_gb        = _mm_load_ps(dadx);
1424                         dadx += 4;
1425                         f_gb_ai     = _mm_load_ps(dadx);
1426                         dadx += 4;
1427                         
1428             /* calculate scalar force */
1429             f_gb    = _mm_mul_ps(f_gb,rbai); 
1430             f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1431             f_gb    = _mm_add_ps(f_gb,f_gb_ai);
1432             
1433             tx     = _mm_mul_ps(f_gb,dx);
1434             ty     = _mm_mul_ps(f_gb,dy);
1435             tz     = _mm_mul_ps(f_gb,dz);
1436             
1437             fix    = _mm_add_ps(fix,tx);
1438             fiy    = _mm_add_ps(fiy,ty);
1439             fiz    = _mm_add_ps(fiz,tz);
1440             
1441             GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A,f+j3B,f+j3C,f+j3D,tx,ty,tz);
1442                 }
1443         
1444                 /*deal with odd elements */
1445                 if(offset!=0) 
1446         {
1447             if(offset==1)
1448             {
1449                 jnrA        = jjnr[k];   
1450                 j3A         = 3*jnrA; 
1451                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1452                 GMX_MM_LOAD_1VALUE_PS(rb+jnrA,rbaj);
1453             } 
1454             else if(offset==2)
1455             {
1456                 jnrA        = jjnr[k];   
1457                 jnrB        = jjnr[k+1];
1458                 j3A         = 3*jnrA;  
1459                 j3B         = 3*jnrB;
1460                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1461                 GMX_MM_LOAD_2VALUES_PS(rb+jnrA,rb+jnrB,rbaj);
1462             }
1463             else
1464             {
1465                 /* offset must be 3 */   
1466                 jnrA        = jjnr[k];   
1467                 jnrB        = jjnr[k+1];
1468                 jnrC        = jjnr[k+2];
1469                 j3A         = 3*jnrA;  
1470                 j3B         = 3*jnrB;
1471                 j3C         = 3*jnrC;
1472                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1473                 GMX_MM_LOAD_3VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rbaj);
1474             }
1475             
1476             dx          = _mm_sub_ps(ix,jx);
1477             dy          = _mm_sub_ps(iy,jy);
1478             dz          = _mm_sub_ps(iz,jz);
1479             
1480             /* load chain rule terms for j1-4 */
1481             f_gb        = _mm_load_ps(dadx);
1482             dadx += 4;
1483             f_gb_ai     = _mm_load_ps(dadx);
1484             dadx += 4;
1485             
1486             /* calculate scalar force */
1487             f_gb    = _mm_mul_ps(f_gb,rbai); 
1488             f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1489             f_gb    = _mm_add_ps(f_gb,f_gb_ai);
1490             
1491             tx     = _mm_mul_ps(f_gb,dx);
1492             ty     = _mm_mul_ps(f_gb,dy);
1493             tz     = _mm_mul_ps(f_gb,dz);
1494             
1495             fix    = _mm_add_ps(fix,tx);
1496             fiy    = _mm_add_ps(fiy,ty);
1497             fiz    = _mm_add_ps(fiz,tz);
1498             
1499             if(offset==1)
1500             {
1501                 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A,tx,ty,tz);
1502             } 
1503             else if(offset==2)
1504             {
1505                 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A,f+j3B,tx,ty,tz);
1506             }
1507             else
1508             {
1509                 /* offset must be 3 */
1510                 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A,f+j3B,f+j3C,tx,ty,tz);
1511             }
1512         } 
1513         
1514                 /* fix/fiy/fiz now contain four partial force terms, that all should be
1515          * added to the i particle forces and shift forces. 
1516          */
1517                 gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,f+ii3,fshift+is3);
1518         }       
1519     
1520         return 0;       
1521 }
1522
1523
1524 #else
1525 /* keep compiler happy */
1526 int genborn_sse_dummy;
1527
1528 #endif /* SSE intrinsics available */