1 /*
2  * 
3  *                This source code is part of
4  * 
5  *                 G   R   O   M   A   C   S
6  * 
7  *          GROningen MAchine for Chemical Simulations
8  * 
9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11  * Copyright (c) 2001-2008, The GROMACS development team,
12  * check out http://www.gromacs.org for more information.
13  
14  * This program is free software; you can redistribute it and/or
15  * modify it under the terms of the GNU General Public License
16  * as published by the Free Software Foundation; either version 2
17  * of the License, or (at your option) any later version.
18  * 
19  * If you want to redistribute modifications, please consider that
20  * scientific software is very special. Version control is crucial -
21  * bugs must be traceable. We will be happy to consider code for
22  * inclusion in the official distribution, but derived work must not
23  * be called official GROMACS. Details are found in the README & COPYING
24  * files - if they are missing, get the official version at www.gromacs.org.
25  * 
26  * To help us fund GROMACS development, we humbly ask that you cite
27  * the papers on the package - you can find them in the top README file.
28  * 
29  * For more info, check our website at http://www.gromacs.org
30  * 
31  * And Hey:
32  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
33  */
34 #ifdef HAVE_CONFIG_H
35 #include <config.h>
36 #endif
37
38 #include <math.h>
39 #include <string.h>
40
41 #include "typedefs.h"
42 #include "smalloc.h"
43 #include "genborn.h"
44 #include "vec.h"
45 #include "grompp.h"
46 #include "pdbio.h"
47 #include "names.h"
48 #include "physics.h"
49 #include "partdec.h"
50 #include "domdec.h"
51 #include "network.h"
52 #include "gmx_fatal.h"
53 #include "mtop_util.h"
55
56 #ifdef GMX_LIB_MPI
57 #include <mpi.h>
58 #endif
59 #ifdef GMX_THREAD_MPI
60 #include "tmpi.h"
61 #endif
62
63
64 /* Only compile this file if SSE intrinsics are available; note that the leading 0 in the condition below currently disables this kernel entirely */
65 #if 0 && defined (GMX_X86_SSE2)
66
67 #include <gmx_sse2_single.h>
68 #include <emmintrin.h>
69
70 #include "genborn_sse2_single.h"
71
72
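/* Compute Born radii with the Still GB model using SSE2 single-precision intrinsics.
 * For each i atom in the GB neighbor list the pairwise polarization contributions are
 * accumulated eight j atoms at a time, then four at a time, with a masked tail of 1-3.
 * The symmetric contribution to each j atom goes into born->gpol_still_work, and the
 * derivative factors needed later for the force chain rule are streamed into fr->dadx.
 */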
73 int 
74 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
75                               int natoms, gmx_localtop_t *top,
76                               const t_atomtypes *atype, float *x, t_nblist *nl,
77                               gmx_genborn_t *born)
78 {
79         int i,k,n,ii,is3,ii3,nj0,nj1,offset;
80         int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
81         int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
82         int shift;
83     int *mdtype;
84         real shX,shY,shZ;
85     int *jjnr;
86     real *shiftvec;
87
88         float gpi_ai,gpi2;
89         float factor;
90         float *gb_radius;
91     float *vsolv;
92     float *work;
93     float *dadx;
94     
95         __m128 ix,iy,iz;
96         __m128 jx,jy,jz;
97         __m128 dx,dy,dz;
98         __m128 tx,ty,tz;
99         __m128 jxB,jyB,jzB;
100         __m128 dxB,dyB,dzB;
101         __m128 txB,tyB,tzB;
102         __m128 rsq,rinv,rinv2,rinv4,rinv6;
103         __m128 rsqB,rinvB,rinv2B,rinv4B,rinv6B;
104         __m128 ratio,gpi,rai,raj,vai,vaj,rvdw;
105         __m128 ratioB,rajB,vajB,rvdwB;
106         __m128 ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
107         __m128 ccfB,dccfB,thetaB,cosqB,termB,sinqB,resB,prodB;
108         __m128 mask,icf4,icf6,mask_cmp;
109         __m128 icf4B,icf6B,mask_cmpB;
110         
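    /* Lane masks for the epilogue: enable only the first 1, 2 or 3 SSE lanes when the
     * number of j atoms in a list entry is not a multiple of four.
     */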
111     __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
112         __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
113         __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
114     
115         const __m128 half   = _mm_set1_ps(0.5f);
116         const __m128 three  = _mm_set1_ps(3.0f);
117         const __m128 one    = _mm_set1_ps(1.0f);
118         const __m128 two    = _mm_set1_ps(2.0f);
119         const __m128 zero   = _mm_set1_ps(0.0f);
120         const __m128 four   = _mm_set1_ps(4.0f);
121         
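        /* Empirical Still-model parameters: P4, 1/P5 and pi*P5, used in the
         * close-contact correction factor CCF below.
         */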
122         const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
123         const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
124         const __m128 still_p4     = _mm_set1_ps(STILL_P4);
125                 
126         factor  = 0.5 * ONE_4PI_EPS0;
127                 
128     gb_radius = born->gb_radius;
129     vsolv     = born->vsolv;
130     work      = born->gpol_still_work;
131         jjnr      = nl->jjnr;
132     shiftvec  = fr->shift_vec[0];
133     dadx      = fr->dadx;
134     
135         jnrA = jnrB = jnrC = jnrD = 0;
136     jx = _mm_setzero_ps();
137     jy = _mm_setzero_ps();
138     jz = _mm_setzero_ps();
139     
140         n = 0;
141     
142         for(i=0;i<natoms;i++)
143         {
144                 work[i]=0;
145         }
146
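        /* Loop over the i atoms of the GB neighbor list. Each i position is translated by
         * the periodic shift vector of the list entry before the pair distances are formed.
         */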
147         for(i=0;i<nl->nri;i++)
148         {
149         ii     = nl->iinr[i];
150                 ii3        = ii*3;
151         is3    = 3*nl->shift[i];     
152         shX    = shiftvec[is3];  
153         shY    = shiftvec[is3+1];
154         shZ    = shiftvec[is3+2];
155         nj0    = nl->jindex[i];      
156         nj1    = nl->jindex[i+1];    
157         
158         ix     = _mm_set1_ps(shX+x[ii3+0]);
159                 iy     = _mm_set1_ps(shY+x[ii3+1]);
160                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
161                 
162                 offset = (nj1-nj0)%4;
163                 
164                 /* Polarization energy for atom ai */
165                 gpi    = _mm_setzero_ps();
166                 
167         rai     = _mm_load1_ps(gb_radius+ii);
168         prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
169         
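                /* Unrolled inner loop: two SSE registers of four j atoms per iteration */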
170                 for(k=nj0;k<nj1-4-offset;k+=8)
171                 {
172                         jnrA        = jjnr[k];   
173                         jnrB        = jjnr[k+1];
174                         jnrC        = jjnr[k+2];
175                         jnrD        = jjnr[k+3];
176                         jnrE        = jjnr[k+4];   
177                         jnrF        = jjnr[k+5];
178                         jnrG        = jjnr[k+6];
179                         jnrH        = jjnr[k+7];
180             
181             j3A         = 3*jnrA;  
182                         j3B         = 3*jnrB;
183                         j3C         = 3*jnrC;
184                         j3D         = 3*jnrD;
185             j3E         = 3*jnrE;  
186                         j3F         = 3*jnrF;
187                         j3G         = 3*jnrG;
188                         j3H         = 3*jnrH;
189             
190             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
191             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
192             
193             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
194             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
195                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
196                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE,vsolv+jnrF,vsolv+jnrG,vsolv+jnrH,vajB);
197
198                         dx          = _mm_sub_ps(ix,jx);
199                         dy          = _mm_sub_ps(iy,jy);
200                         dz          = _mm_sub_ps(iz,jz);
201                         dxB         = _mm_sub_ps(ix,jxB);
202                         dyB         = _mm_sub_ps(iy,jyB);
203                         dzB         = _mm_sub_ps(iz,jzB);
204             
205             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
206             rsqB        = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
207             rinv        = gmx_mm_invsqrt_ps(rsq);
208             rinvB       = gmx_mm_invsqrt_ps(rsqB);
209             rinv2       = _mm_mul_ps(rinv,rinv);
210             rinv2B      = _mm_mul_ps(rinvB,rinvB);
211             rinv4       = _mm_mul_ps(rinv2,rinv2);
212             rinv4B      = _mm_mul_ps(rinv2B,rinv2B);
213             rinv6       = _mm_mul_ps(rinv4,rinv2);
214             rinv6B      = _mm_mul_ps(rinv4B,rinv2B);
215             
216             rvdw        = _mm_add_ps(rai,raj);
217             rvdwB       = _mm_add_ps(rai,rajB);
218             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
219             ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB,rvdwB)));
220
221             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
222             mask_cmpB   = _mm_cmple_ps(ratioB,still_p5inv);
223             
224             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
225             if( 0 == _mm_movemask_ps(mask_cmp) )
226             {
227                 /* if ratio>still_p5inv for ALL elements */
228                 ccf         = one;
229                 dccf        = _mm_setzero_ps();
230             }
231             else 
232             {
233                 ratio       = _mm_min_ps(ratio,still_p5inv);
234                 theta       = _mm_mul_ps(ratio,still_pip5);
235                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
236                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
237                 ccf         = _mm_mul_ps(term,term);
238                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
239                                          _mm_mul_ps(sinq,theta));
240             }
241             if( 0 == _mm_movemask_ps(mask_cmpB) )
242             {
243                 /* if ratio>still_p5inv for ALL elements */
244                 ccfB        = one;
245                 dccfB       = _mm_setzero_ps();
246             }
247             else 
248             {
249                 ratioB      = _mm_min_ps(ratioB,still_p5inv);
250                 thetaB      = _mm_mul_ps(ratioB,still_pip5);
251                 gmx_mm_sincos_ps(thetaB,&sinqB,&cosqB);
252                 termB       = _mm_mul_ps(half,_mm_sub_ps(one,cosqB));
253                 ccfB        = _mm_mul_ps(termB,termB);
254                 dccfB       = _mm_mul_ps(_mm_mul_ps(two,termB),
255                                          _mm_mul_ps(sinqB,thetaB));
256             }
257             
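            /* Still pairwise term: atom j adds P4*vsolv_j*CCF/r^4 to the polarization sum
             * of atom i, and the symmetric P4*vsolv_i*CCF/r^4 is accumulated into work[j].
             * The corresponding derivative factors, scaled by (4*CCF-dCCF)/r^6, are stored
             * in dadx for the chain-rule force pass.
             */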
258             prod        = _mm_mul_ps(still_p4,vaj);
259             prodB       = _mm_mul_ps(still_p4,vajB);
260             icf4        = _mm_mul_ps(ccf,rinv4);
261             icf4B       = _mm_mul_ps(ccfB,rinv4B);
262             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
263             icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccfB),dccfB), rinv6B);
264
265             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
266             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_mul_ps(prod_ai,icf4B));
267             
268             gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod,icf4) , _mm_mul_ps(prodB,icf4B) ) );
269             
270             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
271             dadx+=4;
272             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
273             dadx+=4;
274             _mm_store_ps(dadx,_mm_mul_ps(prodB,icf6B));
275             dadx+=4;
276             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6B));
277             dadx+=4;
278                 } 
279  
280         for(;k<nj1-offset;k+=4)
281                 {
282                         jnrA        = jjnr[k];   
283                         jnrB        = jjnr[k+1];
284                         jnrC        = jjnr[k+2];
285                         jnrD        = jjnr[k+3];
286             
287             j3A         = 3*jnrA;  
288                         j3B         = 3*jnrB;
289                         j3C         = 3*jnrC;
290                         j3D         = 3*jnrD;
291             
292             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
293             
294             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
295                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
296             
297                         dx          = _mm_sub_ps(ix,jx);
298                         dy          = _mm_sub_ps(iy,jy);
299                         dz          = _mm_sub_ps(iz,jz);
300             
301             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
302             rinv        = gmx_mm_invsqrt_ps(rsq);
303             rinv2       = _mm_mul_ps(rinv,rinv);
304             rinv4       = _mm_mul_ps(rinv2,rinv2);
305             rinv6       = _mm_mul_ps(rinv4,rinv2);
306             
307             rvdw        = _mm_add_ps(rai,raj);
308             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
309             
310             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
311
312             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
313             if(0 == _mm_movemask_ps(mask_cmp))
314             {
315                 /* if ratio>still_p5inv for ALL elements */
316                 ccf         = one;
317                 dccf        = _mm_setzero_ps();
318             }
319             else 
320             {
321                 ratio       = _mm_min_ps(ratio,still_p5inv);
322                 theta       = _mm_mul_ps(ratio,still_pip5);
323                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
324                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
325                 ccf         = _mm_mul_ps(term,term);
326                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
327                                          _mm_mul_ps(sinq,theta));
328             }
329             
330             prod        = _mm_mul_ps(still_p4,vaj);
331             icf4        = _mm_mul_ps(ccf,rinv4);
332             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
333
334             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
335             
336             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
337             
338             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
339             dadx+=4;
340             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
341             dadx+=4;
342                 } 
343         
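        /* Epilogue: process the remaining 1-3 j atoms. The partial-lane load/increment
         * macros and the lane mask keep inactive lanes from contributing.
         */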
344         if(offset!=0)
345         {
346             if(offset==1)
347             {
348                 jnrA        = jjnr[k];   
349                 j3A         = 3*jnrA;  
350                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
351                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
352                 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA,vaj);
353                 mask        = mask1;
354             } 
355             else if(offset==2)
356             {
357                 jnrA        = jjnr[k];   
358                 jnrB        = jjnr[k+1];
359                 j3A         = 3*jnrA;  
360                 j3B         = 3*jnrB;
361                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
362                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
363                 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA,vsolv+jnrB,vaj);
364                 mask        = mask2;
365             }
366             else
367             {
368                 /* offset must be 3 */   
369                 jnrA        = jjnr[k];   
370                 jnrB        = jjnr[k+1];
371                 jnrC        = jjnr[k+2];
372                 j3A         = 3*jnrA;  
373                 j3B         = 3*jnrB;
374                 j3C         = 3*jnrC;
375                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
376                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
377                 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vaj);
378                 mask        = mask3;
379             }
380
381                         dx          = _mm_sub_ps(ix,jx);
382                         dy          = _mm_sub_ps(iy,jy);
383                         dz          = _mm_sub_ps(iz,jz);
384             
385             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
386             rinv        = gmx_mm_invsqrt_ps(rsq);
387             rinv2       = _mm_mul_ps(rinv,rinv);
388             rinv4       = _mm_mul_ps(rinv2,rinv2);
389             rinv6       = _mm_mul_ps(rinv4,rinv2);
390             
391             rvdw        = _mm_add_ps(rai,raj);
392             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
393             
394             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
395             
396             if(0 == _mm_movemask_ps(mask_cmp))
397             {
398                 /* if ratio>still_p5inv for ALL elements */
399                 ccf         = one;
400                 dccf        = _mm_setzero_ps();
401             }
402             else 
403             {
404                 ratio       = _mm_min_ps(ratio,still_p5inv);
405                 theta       = _mm_mul_ps(ratio,still_pip5);
406                 gmx_mm_sincos_ps(theta,&sinq,&cosq);            
407                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
408                 ccf         = _mm_mul_ps(term,term);
409                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
410                                          _mm_mul_ps(sinq,theta));
411             }
412
413             prod        = _mm_mul_ps(still_p4,vaj);
414             icf4        = _mm_mul_ps(ccf,rinv4);
415             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
416             
417             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
418             
419             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
420             dadx+=4;
421             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
422             dadx+=4;
423             
424             tmp = _mm_mul_ps(prod_ai,icf4);
425
426             if(offset==1)
427             {
428                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
429             } 
430             else if(offset==2)
431             {
432                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
433             }
434             else
435             {
436                 /* offset must be 3 */
437                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
438             }
439         }
440         GMX_MM_UPDATE_1POT_PS(gpi,work+ii);
441         }
442
443         /* Sum up the polarization energy from other nodes */
444         if(PARTDECOMP(cr))
445         {
446                 gmx_sum(natoms, work, cr);
447         }
448         else if(DOMAINDECOMP(cr))
449         {
450                 dd_atom_sum_real(cr->dd, work);
451         }
452         
453         /* Compute the radii */
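        /* Still model: the Born radius is bRad_i = factor/|gpol_i + work_i| with
         * factor = ONE_4PI_EPS0/2; invsqrta_i = 1/sqrt(bRad_i) is cached for the GB kernels.
         */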
454         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
455         {               
456                 if(born->use[i] != 0)
457                 {
458                         gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
459                         gpi2             = gpi_ai * gpi_ai;
460                         born->bRad[i]   = factor*gmx_invsqrt(gpi2);
461                         fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
462                 }
463         }
464                 
465         /* Extra (local) communication required for DD */
466         if(DOMAINDECOMP(cr))
467         {
468                 dd_atom_spread_real(cr->dd, born->bRad);
469                 dd_atom_spread_real(cr->dd, fr->invsqrta);
470         }
471     
472         return 0;       
473 }
474
475
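/* Compute Born radii for the HCT and OBC models using SSE2 single-precision intrinsics.
 * For every pair in the GB neighbor list the analytical HCT descreening integral is
 * evaluated in both directions (aj onto ai and ai onto aj); the sums are accumulated in
 * born->gpol_hct_work and the r-derivatives are stored in fr->dadx for the chain-rule
 * force pass. The gb_algorithm argument selects how the final radii are formed from
 * these sums after the pair loop.
 */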
476 int 
477 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
478                                 const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
479 {
480         int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
481     int jnrA,jnrB,jnrC,jnrD;
482     int j3A,j3B,j3C,j3D;
483     int jnrE,jnrF,jnrG,jnrH;
484     int j3E,j3F,j3G,j3H;
485         float shX,shY,shZ;
486         float rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
487         float sum_ai2, sum_ai3,tsum,tchain,doffset;
488         float *obc_param;
489     float *gb_radius;
490     float *work;
491     int *  jjnr;
492     float *dadx;
493     float *shiftvec;
494     float min_rad,rad;
495     
496         __m128 ix,iy,iz,jx,jy,jz;
497         __m128 dx,dy,dz,t1,t2,t3,t4;
498         __m128 rsq,rinv,r;
499         __m128 rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
500         __m128 uij,lij2,uij2,lij3,uij3,diff2;
501         __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
502         __m128 sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
503         __m128 dadx1,dadx2;
504     __m128 logterm;
505         __m128 mask;
506         __m128 obc_mask1,obc_mask2,obc_mask3;
507     __m128 jxB,jyB,jzB,t1B,t2B,t3B,t4B;
508     __m128 dxB,dyB,dzB,rsqB,rinvB,rB;
509         __m128 rajB, raj_invB,rai_inv2B,sk2B,lijB,dlijB,duijB;
510         __m128 uijB,lij2B,uij2B,lij3B,uij3B,diff2B;
511         __m128 lij_invB,sk2_invB,prodB;
512         __m128 sk_ajB,sk2_ajB,sk2_rinvB;
513         __m128 dadx1B,dadx2B;
514     __m128 logtermB;
515     __m128 obc_mask1B,obc_mask2B,obc_mask3B;
516
517     __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
518         __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
519         __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
520         
521     __m128 oneeighth   = _mm_set1_ps(0.125);
522     __m128 onefourth   = _mm_set1_ps(0.25);
523
524         const __m128 half  = _mm_set1_ps(0.5f);
525         const __m128 three = _mm_set1_ps(3.0f);
526         const __m128 one   = _mm_set1_ps(1.0f);
527         const __m128 two   = _mm_set1_ps(2.0f);
528         const __m128 zero  = _mm_set1_ps(0.0f);
529         const __m128 neg   = _mm_set1_ps(-1.0f);
530         
531         /* Set the dielectric offset */
532         doffset   = born->gb_doffset;
533         gb_radius = born->gb_radius;
534     obc_param = born->param;
535     work      = born->gpol_hct_work;
536     jjnr      = nl->jjnr;
537     dadx      = fr->dadx;
538     shiftvec  = fr->shift_vec[0];
539     
540     jx        = _mm_setzero_ps();
541     jy        = _mm_setzero_ps();
542     jz        = _mm_setzero_ps();
543     
544     jnrA = jnrB = jnrC = jnrD = 0;
545     
546         for(i=0;i<born->nr;i++)
547         {
548                 work[i] = 0;
549         }
550         
551         for(i=0;i<nl->nri;i++)
552         {
553         ii     = nl->iinr[i];
554                 ii3        = ii*3;
555         is3    = 3*nl->shift[i];     
556         shX    = shiftvec[is3];  
557         shY    = shiftvec[is3+1];
558         shZ    = shiftvec[is3+2];
559         nj0    = nl->jindex[i];      
560         nj1    = nl->jindex[i+1];    
561         
562         ix     = _mm_set1_ps(shX+x[ii3+0]);
563                 iy     = _mm_set1_ps(shY+x[ii3+1]);
564                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
565                 
566                 offset = (nj1-nj0)%4;
567
568                 rai    = _mm_load1_ps(gb_radius+ii);
569                 rai_inv= gmx_mm_inv_ps(rai);
570                                 
571                 sum_ai = _mm_setzero_ps();
572                 
573                 sk_ai  = _mm_load1_ps(born->param+ii);
574                 sk2_ai = _mm_mul_ps(sk_ai,sk_ai);
575                                 
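                /* Unrolled inner loop: eight j atoms per iteration; both descreening
                 * directions are evaluated and two derivative values per pair are stored.
                 */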
576                 for(k=nj0;k<nj1-4-offset;k+=8)
577                 {
578                         jnrA        = jjnr[k];   
579                         jnrB        = jjnr[k+1];
580                         jnrC        = jjnr[k+2];
581                         jnrD        = jjnr[k+3];
582                         jnrE        = jjnr[k+4];   
583                         jnrF        = jjnr[k+5];
584                         jnrG        = jjnr[k+6];
585                         jnrH        = jjnr[k+7];
586                         
587             j3A         = 3*jnrA;  
588                         j3B         = 3*jnrB;
589                         j3C         = 3*jnrC;
590                         j3D         = 3*jnrD;
591             j3E         = 3*jnrE;  
592                         j3F         = 3*jnrF;
593                         j3G         = 3*jnrG;
594                         j3H         = 3*jnrH;
595             
596             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
597             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
598             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
599             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
600             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
601             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE,obc_param+jnrF,obc_param+jnrG,obc_param+jnrH,sk_ajB);
602                         
603             dx    = _mm_sub_ps(ix, jx);
604                         dy    = _mm_sub_ps(iy, jy);
605                         dz    = _mm_sub_ps(iz, jz);
606             dxB   = _mm_sub_ps(ix, jxB);
607                         dyB   = _mm_sub_ps(iy, jyB);
608                         dzB   = _mm_sub_ps(iz, jzB);
609                         
610             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
611             rsqB        = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
612                                     
613             rinv        = gmx_mm_invsqrt_ps(rsq);
614             r           = _mm_mul_ps(rsq,rinv);
615             rinvB       = gmx_mm_invsqrt_ps(rsqB);
616             rB          = _mm_mul_ps(rsqB,rinvB);
617
618                         /* Compute raj_inv aj1-4 */
619             raj_inv     = gmx_mm_inv_ps(raj);
620             raj_invB    = gmx_mm_inv_ps(rajB);
621
622             /* Evaluate influence of atom aj -> ai */
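            /* The three masks encode the HCT descreening cases:
             *   obc_mask1: rai < r+sk_aj  -> sphere j descreens atom i at all
             *   obc_mask2: rai < r-sk_aj  -> atom i lies outside sphere j, so the lower
             *              integration limit lij is 1/(r-sk_aj) instead of 1/rai
             *   obc_mask3: rai < sk_aj-r  -> atom i is engulfed by sphere j, adding the
             *              2*(1/rai - lij) correction term t4
             * The t1..t4 expressions below evaluate the closed-form HCT integral
             * 0.5*[1/L - 1/U + r/4*(1/U^2-1/L^2) + sk^2/(4r)*(1/L^2-1/U^2) + ln(L/U)/(2r)].
             */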
623             t1            = _mm_add_ps(r,sk_aj);
624             t2            = _mm_sub_ps(r,sk_aj);
625             t3            = _mm_sub_ps(sk_aj,r);
626             t1B           = _mm_add_ps(rB,sk_ajB);
627             t2B           = _mm_sub_ps(rB,sk_ajB);
628             t3B           = _mm_sub_ps(sk_ajB,rB);
629             obc_mask1     = _mm_cmplt_ps(rai, t1);
630             obc_mask2     = _mm_cmplt_ps(rai, t2);
631             obc_mask3     = _mm_cmplt_ps(rai, t3);
632             obc_mask1B    = _mm_cmplt_ps(rai, t1B);
633             obc_mask2B    = _mm_cmplt_ps(rai, t2B);
634             obc_mask3B    = _mm_cmplt_ps(rai, t3B);
635
636             uij           = gmx_mm_inv_ps(t1);
637             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
638                                       _mm_andnot_ps(obc_mask2,rai_inv));
639             dlij          = _mm_and_ps(one,obc_mask2);
640             uij2          = _mm_mul_ps(uij, uij);
641             uij3          = _mm_mul_ps(uij2,uij);
642             lij2          = _mm_mul_ps(lij, lij);
643             lij3          = _mm_mul_ps(lij2,lij);
644
645             uijB          = gmx_mm_inv_ps(t1B);
646             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
647                                       _mm_andnot_ps(obc_mask2B,rai_inv));
648             dlijB         = _mm_and_ps(one,obc_mask2B);
649             uij2B         = _mm_mul_ps(uijB, uijB);
650             uij3B         = _mm_mul_ps(uij2B,uijB);
651             lij2B         = _mm_mul_ps(lijB, lijB);
652             lij3B         = _mm_mul_ps(lij2B,lijB);
653
654             diff2         = _mm_sub_ps(uij2,lij2);
655             lij_inv       = gmx_mm_invsqrt_ps(lij2);
656             sk2_aj        = _mm_mul_ps(sk_aj,sk_aj);
657             sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
658             prod          = _mm_mul_ps(onefourth,sk2_rinv);
659
660             diff2B        = _mm_sub_ps(uij2B,lij2B);
661             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
662             sk2_ajB       = _mm_mul_ps(sk_ajB,sk_ajB);
663             sk2_rinvB     = _mm_mul_ps(sk2_ajB,rinvB);
664             prodB         = _mm_mul_ps(onefourth,sk2_rinvB);
665
666             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
667             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
668             
669             t1            = _mm_sub_ps(lij,uij);
670             t2            = _mm_mul_ps(diff2,
671                                        _mm_sub_ps(_mm_mul_ps(onefourth,r),
672                                                   prod));
673             t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
674             t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
675             t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
676             t4            = _mm_and_ps(t4,obc_mask3);
677             t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
678             
679             t1B           = _mm_sub_ps(lijB,uijB);
680             t2B           = _mm_mul_ps(diff2B,
681                                        _mm_sub_ps(_mm_mul_ps(onefourth,rB),
682                                                   prodB));
683             t3B           = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
684             t1B           = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
685             t4B           = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lijB));
686             t4B           = _mm_and_ps(t4B,obc_mask3B);
687             t1B           = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
688             
689             sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1,obc_mask1), _mm_and_ps(t1B,obc_mask1B) ));
690             
691             t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
692                                        _mm_mul_ps(prod,lij3));
693             t1            = _mm_sub_ps(t1,
694                                        _mm_mul_ps(onefourth,
695                                                   _mm_add_ps(_mm_mul_ps(lij,rinv),
696                                                              _mm_mul_ps(lij3,r))));
697             t2            = _mm_mul_ps(onefourth,
698                                        _mm_add_ps(_mm_mul_ps(uij,rinv),
699                                                   _mm_mul_ps(uij3,r)));
700             t2            = _mm_sub_ps(t2,
701                                        _mm_add_ps(_mm_mul_ps(half,uij2),
702                                                       _mm_mul_ps(prod,uij3)));
703             t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
704                                        _mm_mul_ps(rinv,rinv));
705             t3            = _mm_sub_ps(t3,
706                                        _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
707                                                   _mm_add_ps(one,
708                                                              _mm_mul_ps(sk2_rinv,rinv))));
709             t1            = _mm_mul_ps(rinv,
710                                        _mm_add_ps(_mm_mul_ps(dlij,t1),
711                                                   _mm_add_ps(t2,t3)));
712             
713
714             
715             t1B           = _mm_add_ps(_mm_mul_ps(half,lij2B),
716                                        _mm_mul_ps(prodB,lij3B));
717             t1B           = _mm_sub_ps(t1B,
718                                        _mm_mul_ps(onefourth,
719                                                   _mm_add_ps(_mm_mul_ps(lijB,rinvB),
720                                                              _mm_mul_ps(lij3B,rB))));
721             t2B           = _mm_mul_ps(onefourth,
722                                        _mm_add_ps(_mm_mul_ps(uijB,rinvB),
723                                                   _mm_mul_ps(uij3B,rB)));
724             t2B           = _mm_sub_ps(t2B,
725                                        _mm_add_ps(_mm_mul_ps(half,uij2B),
726                                                   _mm_mul_ps(prodB,uij3B)));
727             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
728                                        _mm_mul_ps(rinvB,rinvB));
729             t3B           = _mm_sub_ps(t3B,
730                                        _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
731                                                   _mm_add_ps(one,
732                                                              _mm_mul_ps(sk2_rinvB,rinvB))));
733             t1B           = _mm_mul_ps(rinvB,
734                                        _mm_add_ps(_mm_mul_ps(dlijB,t1B),
735                                                   _mm_add_ps(t2B,t3B)));
736             
737             dadx1         = _mm_and_ps(t1,obc_mask1);
738             dadx1B        = _mm_and_ps(t1B,obc_mask1B);
739
740
741             /* Evaluate influence of atom ai -> aj */
742             t1            = _mm_add_ps(r,sk_ai);
743             t2            = _mm_sub_ps(r,sk_ai);
744             t3            = _mm_sub_ps(sk_ai,r);
745             t1B           = _mm_add_ps(rB,sk_ai);
746             t2B           = _mm_sub_ps(rB,sk_ai);
747             t3B           = _mm_sub_ps(sk_ai,rB);
748             obc_mask1     = _mm_cmplt_ps(raj, t1);
749             obc_mask2     = _mm_cmplt_ps(raj, t2);
750             obc_mask3     = _mm_cmplt_ps(raj, t3);
751             obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
752             obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
753             obc_mask3B    = _mm_cmplt_ps(rajB, t3B);
754             
755             uij           = gmx_mm_inv_ps(t1);
756             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
757                                       _mm_andnot_ps(obc_mask2,raj_inv));
758             dlij          = _mm_and_ps(one,obc_mask2);
759             uij2          = _mm_mul_ps(uij, uij);
760             uij3          = _mm_mul_ps(uij2,uij);
761             lij2          = _mm_mul_ps(lij, lij);
762             lij3          = _mm_mul_ps(lij2,lij);
763
764             uijB          = gmx_mm_inv_ps(t1B);
765             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
766                                       _mm_andnot_ps(obc_mask2B,raj_invB));
767             dlijB         = _mm_and_ps(one,obc_mask2B);
768             uij2B         = _mm_mul_ps(uijB, uijB);
769             uij3B         = _mm_mul_ps(uij2B,uijB);
770             lij2B         = _mm_mul_ps(lijB, lijB);
771             lij3B         = _mm_mul_ps(lij2B,lijB);
772
773             diff2         = _mm_sub_ps(uij2,lij2);
774             lij_inv       = gmx_mm_invsqrt_ps(lij2);
775             sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
776             prod          = _mm_mul_ps(onefourth,sk2_rinv);
777
778             diff2B        = _mm_sub_ps(uij2B,lij2B);
779             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
780             sk2_rinvB     = _mm_mul_ps(sk2_ai,rinvB);
781             prodB         = _mm_mul_ps(onefourth,sk2_rinvB);
782
783             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
784             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
785
786             t1            = _mm_sub_ps(lij,uij);
787             t2            = _mm_mul_ps(diff2,
788                                        _mm_sub_ps(_mm_mul_ps(onefourth,r),
789                                                   prod));
790             t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
791             t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
792             t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
793             t4            = _mm_and_ps(t4,obc_mask3);
794             t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
795
796             t1B           = _mm_sub_ps(lijB,uijB);
797             t2B           = _mm_mul_ps(diff2B,
798                                        _mm_sub_ps(_mm_mul_ps(onefourth,rB),
799                                                   prodB));
800             t3B           = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
801             t1B           = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
802             t4B           = _mm_mul_ps(two,_mm_sub_ps(raj_invB,lijB));
803             t4B           = _mm_and_ps(t4B,obc_mask3B);
804             t1B           = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
805             
806             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
807             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_and_ps(t1B,obc_mask1B));
808             
809             t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
810                                        _mm_mul_ps(prod,lij3));
811             t1            = _mm_sub_ps(t1,
812                                        _mm_mul_ps(onefourth,
813                                                   _mm_add_ps(_mm_mul_ps(lij,rinv),
814                                                              _mm_mul_ps(lij3,r))));
815             t2            = _mm_mul_ps(onefourth,
816                                        _mm_add_ps(_mm_mul_ps(uij,rinv),
817                                                   _mm_mul_ps(uij3,r)));
818             t2            = _mm_sub_ps(t2,
819                                        _mm_add_ps(_mm_mul_ps(half,uij2),
820                                                   _mm_mul_ps(prod,uij3)));
821             t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
822                                        _mm_mul_ps(rinv,rinv));
823             t3            = _mm_sub_ps(t3,
824                                        _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
825                                                   _mm_add_ps(one,
826                                                              _mm_mul_ps(sk2_rinv,rinv))));
827             t1            = _mm_mul_ps(rinv,
828                                        _mm_add_ps(_mm_mul_ps(dlij,t1),
829                                                   _mm_add_ps(t2,t3)));
830
831             
832             t1B           = _mm_add_ps(_mm_mul_ps(half,lij2B),
833                                        _mm_mul_ps(prodB,lij3B));
834             t1B           = _mm_sub_ps(t1B,
835                                        _mm_mul_ps(onefourth,
836                                                   _mm_add_ps(_mm_mul_ps(lijB,rinvB),
837                                                              _mm_mul_ps(lij3B,rB))));
838             t2B           = _mm_mul_ps(onefourth,
839                                        _mm_add_ps(_mm_mul_ps(uijB,rinvB),
840                                                   _mm_mul_ps(uij3B,rB)));
841             t2B           = _mm_sub_ps(t2B,
842                                        _mm_add_ps(_mm_mul_ps(half,uij2B),
843                                                   _mm_mul_ps(prodB,uij3B)));
844             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
845                                        _mm_mul_ps(rinvB,rinvB));
846             t3B           = _mm_sub_ps(t3B,
847                                        _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
848                                                   _mm_add_ps(one,
849                                                              _mm_mul_ps(sk2_rinvB,rinvB))));
850             t1B           = _mm_mul_ps(rinvB,
851                                        _mm_add_ps(_mm_mul_ps(dlijB,t1B),
852                                                   _mm_add_ps(t2B,t3B)));
853
854             
855             dadx2         = _mm_and_ps(t1,obc_mask1);
856             dadx2B        = _mm_and_ps(t1B,obc_mask1B);
857             
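            /* Store the derivative sets for both quadruplets: dadx1/dadx1B hold
             * d(I_aj->ai)/dr and dadx2/dadx2B hold d(I_ai->aj)/dr, in the order the
             * chain-rule kernel reads them back.
             */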
858             _mm_store_ps(dadx,dadx1);
859             dadx += 4;
860             _mm_store_ps(dadx,dadx2);
861             dadx += 4;
862             _mm_store_ps(dadx,dadx1B);
863             dadx += 4;
864             _mm_store_ps(dadx,dadx2B);
865             dadx += 4;
866             
867         } /* end normal inner loop */
868         
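                /* Four-way loop over the remaining groups of four j atoms. The expensive
                 * inverse/log arithmetic is skipped entirely when no lane passes the
                 * rai < r+sk test (movemask check below).
                 */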
869                 for(;k<nj1-offset;k+=4)
870                 {
871                         jnrA        = jjnr[k];   
872                         jnrB        = jjnr[k+1];
873                         jnrC        = jjnr[k+2];
874                         jnrD        = jjnr[k+3];
875                         
876             j3A         = 3*jnrA;  
877                         j3B         = 3*jnrB;
878                         j3C         = 3*jnrC;
879                         j3D         = 3*jnrD;
880             
881             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
882             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
883             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
884                         
885             dx    = _mm_sub_ps(ix, jx);
886                         dy    = _mm_sub_ps(iy, jy);
887                         dz    = _mm_sub_ps(iz, jz);
888                         
889             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
890             
891             rinv        = gmx_mm_invsqrt_ps(rsq);
892             r           = _mm_mul_ps(rsq,rinv);
893             
894                         /* Compute raj_inv aj1-4 */
895             raj_inv     = gmx_mm_inv_ps(raj);
896             
897             /* Evaluate influence of atom aj -> ai */
898             t1            = _mm_add_ps(r,sk_aj);
899             obc_mask1     = _mm_cmplt_ps(rai, t1);
900             
901             if(_mm_movemask_ps(obc_mask1))
902             {
903                 /* If any of the elements has rai<dr+sk, this is executed */
904                 t2            = _mm_sub_ps(r,sk_aj);
905                 t3            = _mm_sub_ps(sk_aj,r);
906                 
907                 obc_mask2     = _mm_cmplt_ps(rai, t2);
908                 obc_mask3     = _mm_cmplt_ps(rai, t3);
909                 
910                 uij           = gmx_mm_inv_ps(t1);
911                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
912                                           _mm_andnot_ps(obc_mask2,rai_inv));
913                 dlij          = _mm_and_ps(one,obc_mask2);
914                 uij2          = _mm_mul_ps(uij, uij);
915                 uij3          = _mm_mul_ps(uij2,uij);
916                 lij2          = _mm_mul_ps(lij, lij);
917                 lij3          = _mm_mul_ps(lij2,lij);
918                 diff2         = _mm_sub_ps(uij2,lij2);
919                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
920                 sk2_aj        = _mm_mul_ps(sk_aj,sk_aj);
921                 sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
922                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
923                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
924                 t1            = _mm_sub_ps(lij,uij);
925                 t2            = _mm_mul_ps(diff2,
926                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
927                                                       prod));
928                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
929                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
930                 t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
931                 t4            = _mm_and_ps(t4,obc_mask3);
932                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
933                 sum_ai        = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
934                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
935                                            _mm_mul_ps(prod,lij3));
936                 t1            = _mm_sub_ps(t1,
937                                            _mm_mul_ps(onefourth,
938                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
939                                                                  _mm_mul_ps(lij3,r))));
940                 t2            = _mm_mul_ps(onefourth,
941                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
942                                                       _mm_mul_ps(uij3,r)));
943                 t2            = _mm_sub_ps(t2,
944                                            _mm_add_ps(_mm_mul_ps(half,uij2),
945                                                       _mm_mul_ps(prod,uij3)));
946                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
947                                            _mm_mul_ps(rinv,rinv));
948                 t3            = _mm_sub_ps(t3,
949                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
950                                                       _mm_add_ps(one,
951                                                                  _mm_mul_ps(sk2_rinv,rinv))));
952                 t1            = _mm_mul_ps(rinv,
953                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
954                                                       _mm_add_ps(t2,t3)));
955                 
956                 dadx1         = _mm_and_ps(t1,obc_mask1);
957             }
958             else 
959             {
960                 dadx1         = _mm_setzero_ps();
961             }
962             
963             /* Evaluate influence of atom ai -> aj */
964             t1            = _mm_add_ps(r,sk_ai);
965             obc_mask1     = _mm_cmplt_ps(raj, t1);
966             
967             if(_mm_movemask_ps(obc_mask1))
968             {
969                 t2            = _mm_sub_ps(r,sk_ai);
970                 t3            = _mm_sub_ps(sk_ai,r);
971                 obc_mask2     = _mm_cmplt_ps(raj, t2);
972                 obc_mask3     = _mm_cmplt_ps(raj, t3);
973                 
974                 uij           = gmx_mm_inv_ps(t1);
975                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
976                                           _mm_andnot_ps(obc_mask2,raj_inv));
977                 dlij          = _mm_and_ps(one,obc_mask2);
978                 uij2          = _mm_mul_ps(uij, uij);
979                 uij3          = _mm_mul_ps(uij2,uij);
980                 lij2          = _mm_mul_ps(lij, lij);
981                 lij3          = _mm_mul_ps(lij2,lij);
982                 diff2         = _mm_sub_ps(uij2,lij2);
983                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
984                 sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
985                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
986                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
987                 t1            = _mm_sub_ps(lij,uij);
988                 t2            = _mm_mul_ps(diff2,
989                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
990                                                       prod));
991                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
992                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
993                 t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
994                 t4            = _mm_and_ps(t4,obc_mask3);
995                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
996                 
997                 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
998                 
999                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1000                                            _mm_mul_ps(prod,lij3));
1001                 t1            = _mm_sub_ps(t1,
1002                                            _mm_mul_ps(onefourth,
1003                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1004                                                                  _mm_mul_ps(lij3,r))));
1005                 t2            = _mm_mul_ps(onefourth,
1006                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1007                                                       _mm_mul_ps(uij3,r)));
1008                 t2            = _mm_sub_ps(t2,
1009                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1010                                                       _mm_mul_ps(prod,uij3)));
1011                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1012                                            _mm_mul_ps(rinv,rinv));
1013                 t3            = _mm_sub_ps(t3,
1014                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1015                                                       _mm_add_ps(one,
1016                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1017                 t1            = _mm_mul_ps(rinv,
1018                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1019                                                       _mm_add_ps(t2,t3)));
1020                 dadx2         = _mm_and_ps(t1,obc_mask1);
1021             }
1022             else
1023             {
1024                 dadx2         = _mm_setzero_ps();
1025             }
1026             
1027             _mm_store_ps(dadx,dadx1);
1028             dadx += 4;
1029             _mm_store_ps(dadx,dadx2);
1030             dadx += 4;            
1031         } /* end normal inner loop */
1032         
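        /* Masked epilogue for the last 1-3 j atoms: obc_mask1 is AND-ed with the lane mask
         * so inactive lanes produce no contribution.
         */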
1033         if(offset!=0)
1034         {
1035             if(offset==1)
1036             {
1037                 jnrA        = jjnr[k];   
1038                 j3A         = 3*jnrA;  
1039                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1040                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
1041                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA,sk_aj);
1042                 mask        = mask1;
1043             } 
1044             else if(offset==2)
1045             {
1046                 jnrA        = jjnr[k];   
1047                 jnrB        = jjnr[k+1];
1048                 j3A         = 3*jnrA;  
1049                 j3B         = 3*jnrB;
1050                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1051                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
1052                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA,obc_param+jnrB,sk_aj);
1053                 mask        = mask2;
1054             }
1055             else
1056             {
1057                 /* offset must be 3 */   
1058                 jnrA        = jjnr[k];   
1059                 jnrB        = jjnr[k+1];
1060                 jnrC        = jjnr[k+2];
1061                 j3A         = 3*jnrA;  
1062                 j3B         = 3*jnrB;
1063                 j3C         = 3*jnrC;
1064                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1065                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
1066                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,sk_aj);
1067                 mask        = mask3;
1068             }
1069
1070             dx    = _mm_sub_ps(ix, jx);
1071                         dy    = _mm_sub_ps(iy, jy);
1072                         dz    = _mm_sub_ps(iz, jz);
1073                         
1074             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
1075             
1076             rinv        = gmx_mm_invsqrt_ps(rsq);
1077             r           = _mm_mul_ps(rsq,rinv);
1078             
1079                         /* Compute raj_inv aj1-4 */
1080             raj_inv     = gmx_mm_inv_ps(raj);
1081             
1082             /* Evaluate influence of atom aj -> ai */
1083             t1            = _mm_add_ps(r,sk_aj);
1084             obc_mask1     = _mm_cmplt_ps(rai, t1);
1085             obc_mask1     = _mm_and_ps(obc_mask1,mask);
1086
1087             if(_mm_movemask_ps(obc_mask1))
1088             {
1089                 t2            = _mm_sub_ps(r,sk_aj);
1090                 t3            = _mm_sub_ps(sk_aj,r);
1091                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1092                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1093                 
1094                 uij           = gmx_mm_inv_ps(t1);
1095                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1096                                           _mm_andnot_ps(obc_mask2,rai_inv));
1097                 dlij          = _mm_and_ps(one,obc_mask2);
1098                 uij2          = _mm_mul_ps(uij, uij);
1099                 uij3          = _mm_mul_ps(uij2,uij);
1100                 lij2          = _mm_mul_ps(lij, lij);
1101                 lij3          = _mm_mul_ps(lij2,lij);
1102                 diff2         = _mm_sub_ps(uij2,lij2);
1103                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1104                 sk2_aj         = _mm_mul_ps(sk_aj,sk_aj);
1105                 sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
1106                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
1107                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1108                 t1            = _mm_sub_ps(lij,uij);
1109                 t2            = _mm_mul_ps(diff2,
1110                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
1111                                                       prod));
1112                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1113                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1114                 t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
1115                 t4            = _mm_and_ps(t4,obc_mask3);
1116                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1117                 sum_ai        = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
1118                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1119                                            _mm_mul_ps(prod,lij3));
1120                 t1            = _mm_sub_ps(t1,
1121                                            _mm_mul_ps(onefourth,
1122                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1123                                                                  _mm_mul_ps(lij3,r))));
1124                 t2            = _mm_mul_ps(onefourth,
1125                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1126                                                       _mm_mul_ps(uij3,r)));
1127                 t2            = _mm_sub_ps(t2,
1128                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1129                                                       _mm_mul_ps(prod,uij3)));
1130                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1131                                            _mm_mul_ps(rinv,rinv));
1132                 t3            = _mm_sub_ps(t3,
1133                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1134                                                       _mm_add_ps(one,
1135                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1136                 t1            = _mm_mul_ps(rinv,
1137                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1138                                                       _mm_add_ps(t2,t3)));
1139                 dadx1         = _mm_and_ps(t1,obc_mask1);
1140             }
1141             else
1142             {
1143                 dadx1         = _mm_setzero_ps();
1144             }
1145
1146                 /* Evaluate influence of atom ai -> aj */
1147             t1            = _mm_add_ps(r,sk_ai);
1148             obc_mask1     = _mm_cmplt_ps(raj, t1);
1149             obc_mask1     = _mm_and_ps(obc_mask1,mask);
1150             
1151             if(_mm_movemask_ps(obc_mask1))
1152             {
1153                 t2            = _mm_sub_ps(r,sk_ai);
1154                 t3            = _mm_sub_ps(sk_ai,r);
1155                 obc_mask2     = _mm_cmplt_ps(raj, t2);
1156                 obc_mask3     = _mm_cmplt_ps(raj, t3);
1157             
1158                 uij           = gmx_mm_inv_ps(t1);
1159                 lij           = _mm_or_ps(_mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1160                                           _mm_andnot_ps(obc_mask2,raj_inv));
1161                 dlij          = _mm_and_ps(one,obc_mask2);
1162                 uij2          = _mm_mul_ps(uij, uij);
1163                 uij3          = _mm_mul_ps(uij2,uij);
1164                 lij2          = _mm_mul_ps(lij, lij);
1165                 lij3          = _mm_mul_ps(lij2,lij);
1166                 diff2         = _mm_sub_ps(uij2,lij2);
1167                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1168                 sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
1169                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
1170                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1171                 t1            = _mm_sub_ps(lij,uij);
1172                 t2            = _mm_mul_ps(diff2,
1173                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
1174                                                       prod));
1175                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1176                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1177                 t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
1178                 t4            = _mm_and_ps(t4,obc_mask3);
1179                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1180                 
1181                 tmp           = _mm_and_ps(t1,obc_mask1);
1182                 
1183                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1184                                            _mm_mul_ps(prod,lij3));
1185                 t1            = _mm_sub_ps(t1,
1186                                            _mm_mul_ps(onefourth,
1187                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1188                                                                  _mm_mul_ps(lij3,r))));
1189                 t2            = _mm_mul_ps(onefourth,
1190                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1191                                                       _mm_mul_ps(uij3,r)));
1192                 t2            = _mm_sub_ps(t2,
1193                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1194                                                       _mm_mul_ps(prod,uij3)));
1195                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1196                                            _mm_mul_ps(rinv,rinv));
1197                 t3            = _mm_sub_ps(t3,
1198                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1199                                                       _mm_add_ps(one,
1200                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1201                 t1            = _mm_mul_ps(rinv,
1202                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1203                                                       _mm_add_ps(t2,t3)));
1204                 dadx2         = _mm_and_ps(t1,obc_mask1);
1205             }
1206             else
1207             {
1208                 dadx2         = _mm_setzero_ps();
1209                 tmp           = _mm_setzero_ps();
1210             }
1211             
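                 /* The two derivative quartets are stored back to back; calc_gb_chainrule_sse2_single()
                  * below reads them in the same order, eight floats per quartet of j atoms.
                  */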
1212             _mm_store_ps(dadx,dadx1);
1213             dadx += 4;
1214             _mm_store_ps(dadx,dadx2);
1215             dadx += 4;
1216             
1217             if(offset==1)
1218             {
1219                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
1220             } 
1221             else if(offset==2)
1222             {
1223                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
1224             }
1225             else
1226             {
1227                 /* offset must be 3 */
1228                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
1229             }
1230             
1231         }
1232         GMX_MM_UPDATE_1POT_PS(sum_ai,work+ii);
1233         
1234         }
1235         
1236         /* Parallel summations */
1237         if(PARTDECOMP(cr))
1238         {
1239                 gmx_sum(natoms, work, cr);
1240         }
1241         else if(DOMAINDECOMP(cr))
1242         {
1243                 dd_atom_sum_real(cr->dd, work);
1244         }
1245         
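         /* work[] now holds the complete pairwise sum for each local atom; convert it
          * into Born radii according to the selected GB model.
          */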
1246     if(gb_algorithm==egbHCT)
1247     {
1248         /* HCT */
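             /* HCT: 1/b_i = 1/(rho_i - doffset) - work[i], with rho_i the atom-type
              * gb_radius; the resulting radius is clamped from below by rho_i itself.
              */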
1249         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1250         {
1251             if(born->use[i] != 0)
1252             {
1253                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset; 
1254                 sum     = 1.0/rr - work[i];
1255                 min_rad = rr + doffset;
1256                 rad     = 1.0/sum; 
1257                 
1258                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
1259                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1260             }
1261         }
1262         
1263         /* Extra communication required for DD */
1264         if(DOMAINDECOMP(cr))
1265         {
1266             dd_atom_spread_real(cr->dd, born->bRad);
1267             dd_atom_spread_real(cr->dd, fr->invsqrta);
1268         }
1269     }
1270     else
1271     {
1272         /* OBC */
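             /* OBC: with psi = (rho_i - doffset)*work[i],
              * 1/b_i = 1/(rho_i - doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho_i.
              * drobc[i] stores d(tanh term/rho_i)/d(work[i]), the chain-rule factor used later
              * in calc_gb_chainrule_sse2_single().
              */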
1273         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1274         {
1275             if(born->use[i] != 0)
1276             {
1277                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
1278                 rr_inv2 = 1.0/rr;
1279                 rr      = rr-doffset; 
1280                 rr_inv  = 1.0/rr;
1281                 sum     = rr * work[i];
1282                 sum2    = sum  * sum;
1283                 sum3    = sum2 * sum;
1284                 
1285                 tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1286                 born->bRad[i] = rr_inv - tsum*rr_inv2;
1287                 born->bRad[i] = 1.0 / born->bRad[i];
1288                 
1289                 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1290                 
1291                 tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1292                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1293             }
1294         }
1295         /* Extra (local) communication required for DD */
1296         if(DOMAINDECOMP(cr))
1297         {
1298             dd_atom_spread_real(cr->dd, born->bRad);
1299             dd_atom_spread_real(cr->dd, fr->invsqrta);
1300             dd_atom_spread_real(cr->dd, born->drobc);
1301         }
1302     }
1303
1304         
1305         
1306         return 0;
1307 }
1308
1309
1310
1311 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda, 
1312                                     float *x, float *f, float *fshift, float *shiftvec,
1313                                     int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)                                               
1314 {
1315         int    i,k,n,ii,jnr,ii3,is3,nj0,nj1,offset,n0,n1;
1316         int        jnrA,jnrB,jnrC,jnrD;
1317     int    j3A,j3B,j3C,j3D;
1318         int        jnrE,jnrF,jnrG,jnrH;
1319     int    j3E,j3F,j3G,j3H;
1320         int *  jjnr;
1321     
1322         float   rbi,shX,shY,shZ;
1323         float   *rb;
1324     
1325         __m128 ix,iy,iz;
1326         __m128 jx,jy,jz;
1327         __m128 jxB,jyB,jzB;
1328         __m128 fix,fiy,fiz;
1329         __m128 dx,dy,dz;
1330     __m128 tx,ty,tz;
1331         __m128 dxB,dyB,dzB;
1332     __m128 txB,tyB,tzB;
1333
1334         __m128 rbai,rbaj,rbajB, f_gb, f_gb_ai,f_gbB,f_gb_aiB;
1335         __m128 xmm1,xmm2,xmm3;
1336         
1337         const __m128 two = _mm_set1_ps(2.0f);
1338     
1339         rb     = born->work; 
1340                         
1341     jjnr   = nl->jjnr;
1342     
1343         /* Loop to get the proper form for the Born radius term, sse style */
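             /* rb[i] collects the per-atom chain-rule prefactor (Born radius, dvda and, for
              * OBC, drobc) so the force loop below needs only one multiply per stored dadx value.
              */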
1344         offset = natoms%4;
1345         
1346         n0     = 0;
1347         n1     = natoms;
1348   
1349         if(gb_algorithm==egbSTILL) 
1350         {
1351                 for(i=n0;i<n1;i++)
1352                 {
1353                         rbi   = born->bRad[i];
1354                         rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1355                 }
1356         }
1357         else if(gb_algorithm==egbHCT) 
1358         {
1359                 for(i=n0;i<n1;i++)
1360                 {
1361                         rbi   = born->bRad[i];
1362                         rb[i] = rbi * rbi * dvda[i];
1363                 }
1364         }
1365         else if(gb_algorithm==egbOBC) 
1366         {
1367                 for(i=n0;i<n1;i++)
1368                 {
1369                         rbi   = born->bRad[i];
1370                         rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1371                 }
1372         }
1373     
1374     jz = _mm_setzero_ps();
1375     
1376     n = j3A = j3B = j3C = j3D = 0;
1377     
1378         for(i=0;i<nl->nri;i++)
1379         {
1380         ii     = nl->iinr[i];
1381                 ii3        = ii*3;
1382         is3    = 3*nl->shift[i];     
1383         shX    = shiftvec[is3];  
1384         shY    = shiftvec[is3+1];
1385         shZ    = shiftvec[is3+2];
1386         nj0    = nl->jindex[i];      
1387         nj1    = nl->jindex[i+1];    
1388
1389         ix     = _mm_set1_ps(shX+x[ii3+0]);
1390                 iy     = _mm_set1_ps(shY+x[ii3+1]);
1391                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
1392                 
1393                 offset = (nj1-nj0)%4;
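                     /* j atoms (0-3) left over after the four-way SIMD loop below */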
1394                 
1395                 rbai   = _mm_load1_ps(rb+ii);                   
1396                 fix    = _mm_setzero_ps();
1397                 fiy    = _mm_setzero_ps();
1398                 fiz    = _mm_setzero_ps();      
1399                                 
1400
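             /* Main SIMD loop: four j atoms per iteration, consuming the eight dadx floats
              * (two quartets) written per iteration by the Born-radius kernel.
              */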
1401         for(k=nj0;k<nj1-offset;k+=4)
1402                 {
1403                         jnrA        = jjnr[k];   
1404                         jnrB        = jjnr[k+1];
1405                         jnrC        = jjnr[k+2];
1406                         jnrD        = jjnr[k+3];
1407             
1408             j3A         = 3*jnrA;  
1409                         j3B         = 3*jnrB;
1410                         j3C         = 3*jnrC;
1411                         j3D         = 3*jnrD;
1412             
1413             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
1414             
1415                         dx          = _mm_sub_ps(ix,jx);
1416                         dy          = _mm_sub_ps(iy,jy);
1417                         dz          = _mm_sub_ps(iz,jz);
1418             
1419             GMX_MM_LOAD_4VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rb+jnrD,rbaj);
1420             
1421                         /* load the two dadx quartets stored per j1-4 iteration: the aj->ai derivative first, then ai->aj */
1422                         f_gb        = _mm_load_ps(dadx);
1423                         dadx += 4;
1424                         f_gb_ai     = _mm_load_ps(dadx);
1425                         dadx += 4;
1426                         
1427             /* calculate scalar force */
1428             f_gb    = _mm_mul_ps(f_gb,rbai); 
1429             f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1430             f_gb    = _mm_add_ps(f_gb,f_gb_ai);
1431             
1432             tx     = _mm_mul_ps(f_gb,dx);
1433             ty     = _mm_mul_ps(f_gb,dy);
1434             tz     = _mm_mul_ps(f_gb,dz);
1435             
1436             fix    = _mm_add_ps(fix,tx);
1437             fiy    = _mm_add_ps(fiy,ty);
1438             fiz    = _mm_add_ps(fiz,tz);
1439             
1440             GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A,f+j3B,f+j3C,f+j3D,tx,ty,tz);
1441                 }
1442         
1443                 /* deal with the remaining (offset) j particles */
1444                 if(offset!=0) 
1445         {
1446             if(offset==1)
1447             {
1448                 jnrA        = jjnr[k];   
1449                 j3A         = 3*jnrA; 
1450                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1451                 GMX_MM_LOAD_1VALUE_PS(rb+jnrA,rbaj);
1452             } 
1453             else if(offset==2)
1454             {
1455                 jnrA        = jjnr[k];   
1456                 jnrB        = jjnr[k+1];
1457                 j3A         = 3*jnrA;  
1458                 j3B         = 3*jnrB;
1459                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1460                 GMX_MM_LOAD_2VALUES_PS(rb+jnrA,rb+jnrB,rbaj);
1461             }
1462             else
1463             {
1464                 /* offset must be 3 */   
1465                 jnrA        = jjnr[k];   
1466                 jnrB        = jjnr[k+1];
1467                 jnrC        = jjnr[k+2];
1468                 j3A         = 3*jnrA;  
1469                 j3B         = 3*jnrB;
1470                 j3C         = 3*jnrC;
1471                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1472                 GMX_MM_LOAD_3VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rbaj);
1473             }
1474             
1475             dx          = _mm_sub_ps(ix,jx);
1476             dy          = _mm_sub_ps(iy,jy);
1477             dz          = _mm_sub_ps(iz,jz);
1478             
1479             /* load chain rule terms for the remaining 1-3 j particles */
1480             f_gb        = _mm_load_ps(dadx);
1481             dadx += 4;
1482             f_gb_ai     = _mm_load_ps(dadx);
1483             dadx += 4;
1484             
1485             /* calculate scalar force */
1486             f_gb    = _mm_mul_ps(f_gb,rbai); 
1487             f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1488             f_gb    = _mm_add_ps(f_gb,f_gb_ai);
1489             
1490             tx     = _mm_mul_ps(f_gb,dx);
1491             ty     = _mm_mul_ps(f_gb,dy);
1492             tz     = _mm_mul_ps(f_gb,dz);
1493             
1494             fix    = _mm_add_ps(fix,tx);
1495             fiy    = _mm_add_ps(fiy,ty);
1496             fiz    = _mm_add_ps(fiz,tz);
1497             
1498             if(offset==1)
1499             {
1500                 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A,tx,ty,tz);
1501             } 
1502             else if(offset==2)
1503             {
1504                 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A,f+j3B,tx,ty,tz);
1505             }
1506             else
1507             {
1508                 /* offset must be 3 */
1509                 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A,f+j3B,f+j3C,tx,ty,tz);
1510             }
1511         } 
1512         
1513                 /* fix/fiy/fiz now contain four partial force terms that should all be
1514                  * added to the i particle force and the shift force.
1515                  */
1516                 gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,f+ii3,fshift+is3);
1517         }       
1518     
1519         return 0;       
1520 }
1521
1522
1523 #else
1524 /* keep compiler happy */
1525 int genborn_sse_dummy;
1526
1527 #endif /* SSE intrinsics available */