Fixing copyright issues and code contributors
[alexxy/gromacs.git] / src / mdlib / genborn_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2008, The GROMACS development team,
6  * check out http://www.gromacs.org for more information.
7  * Copyright (c) 2012,2013, by the GROMACS development team, led by
8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9  * others, as listed in the AUTHORS file in the top-level source
10  * directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43 #include <string.h>
44
45 #include "typedefs.h"
46 #include "smalloc.h"
47 #include "genborn.h"
48 #include "vec.h"
49 #include "grompp.h"
50 #include "pdbio.h"
51 #include "names.h"
52 #include "physics.h"
53 #include "partdec.h"
54 #include "domdec.h"
55 #include "network.h"
56 #include "gmx_fatal.h"
57 #include "mtop_util.h"
58 #include "genborn.h"
59
60 #ifdef GMX_LIB_MPI
61 #include <mpi.h>
62 #endif
63 #ifdef GMX_THREAD_MPI
64 #include "tmpi.h"
65 #endif
66
67
68 /* Only compile this file if SSE intrinsics are available */
69 #if 0 && defined (GMX_X86_SSE2)
70
71 #include <gmx_sse2_single.h>
72 #include <emmintrin.h>
73
74 #include "genborn_sse2_single.h"
75
76
77 int 
78 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
79                               int natoms, gmx_localtop_t *top,
80                               const t_atomtypes *atype, float *x, t_nblist *nl,
81                               gmx_genborn_t *born)
82 {
83         int i,k,n,ii,is3,ii3,nj0,nj1,offset;
84         int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
85         int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
86         int shift;
87     int *mdtype;
88         real shX,shY,shZ;
89     int *jjnr;
90     real *shiftvec;
91
92         float gpi_ai,gpi2;
93         float factor;
94         float *gb_radius;
95     float *vsolv;
96     float *work;
97     float *dadx;
98     
99         __m128 ix,iy,iz;
100         __m128 jx,jy,jz;
101         __m128 dx,dy,dz;
102         __m128 tx,ty,tz;
103         __m128 jxB,jyB,jzB;
104         __m128 dxB,dyB,dzB;
105         __m128 txB,tyB,tzB;
106         __m128 rsq,rinv,rinv2,rinv4,rinv6;
107         __m128 rsqB,rinvB,rinv2B,rinv4B,rinv6B;
108         __m128 ratio,gpi,rai,raj,vai,vaj,rvdw;
109         __m128 ratioB,rajB,vajB,rvdwB;
110         __m128 ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
111         __m128 ccfB,dccfB,thetaB,cosqB,termB,sinqB,resB,prodB;
112         __m128 mask,icf4,icf6,mask_cmp;
113         __m128 icf4B,icf6B,mask_cmpB;
114         
115     __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
116         __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
117         __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
118     
119         const __m128 half   = _mm_set1_ps(0.5f);
120         const __m128 three  = _mm_set1_ps(3.0f);
121         const __m128 one    = _mm_set1_ps(1.0f);
122         const __m128 two    = _mm_set1_ps(2.0f);
123         const __m128 zero   = _mm_set1_ps(0.0f);
124         const __m128 four   = _mm_set1_ps(4.0f);
125         
126         const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
127         const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
128         const __m128 still_p4     = _mm_set1_ps(STILL_P4);
129                 
130         factor  = 0.5 * ONE_4PI_EPS0;
131                 
132     gb_radius = born->gb_radius;
133     vsolv     = born->vsolv;
134     work      = born->gpol_still_work;
135         jjnr      = nl->jjnr;
136     shiftvec  = fr->shift_vec[0];
137     dadx      = fr->dadx;
138     
139         jnrA = jnrB = jnrC = jnrD = 0;
140     jx = _mm_setzero_ps();
141     jy = _mm_setzero_ps();
142     jz = _mm_setzero_ps();
143     
144         n = 0;
145     
146         for(i=0;i<natoms;i++)
147         {
148                 work[i]=0;
149         }
150
151         for(i=0;i<nl->nri;i++)
152         {
153         ii     = nl->iinr[i];
154                 ii3        = ii*3;
155         is3    = 3*nl->shift[i];     
156         shX    = shiftvec[is3];  
157         shY    = shiftvec[is3+1];
158         shZ    = shiftvec[is3+2];
159         nj0    = nl->jindex[i];      
160         nj1    = nl->jindex[i+1];    
161         
162         ix     = _mm_set1_ps(shX+x[ii3+0]);
163                 iy     = _mm_set1_ps(shY+x[ii3+1]);
164                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
165                 
166                 offset = (nj1-nj0)%4;
167                 
168                 /* Polarization energy for atom ai */
169                 gpi    = _mm_setzero_ps();
170                 
171         rai     = _mm_load1_ps(gb_radius+ii);
172         prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
173         
174                 for(k=nj0;k<nj1-4-offset;k+=8)
175                 {
176                         jnrA        = jjnr[k];   
177                         jnrB        = jjnr[k+1];
178                         jnrC        = jjnr[k+2];
179                         jnrD        = jjnr[k+3];
180                         jnrE        = jjnr[k+4];   
181                         jnrF        = jjnr[k+5];
182                         jnrG        = jjnr[k+6];
183                         jnrH        = jjnr[k+7];
184             
185             j3A         = 3*jnrA;  
186                         j3B         = 3*jnrB;
187                         j3C         = 3*jnrC;
188                         j3D         = 3*jnrD;
189             j3E         = 3*jnrE;  
190                         j3F         = 3*jnrF;
191                         j3G         = 3*jnrG;
192                         j3H         = 3*jnrH;
193             
194             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
195             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
196             
197             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
198             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
199                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
200                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE,vsolv+jnrF,vsolv+jnrG,vsolv+jnrH,vajB);
201
202                         dx          = _mm_sub_ps(ix,jx);
203                         dy          = _mm_sub_ps(iy,jy);
204                         dz          = _mm_sub_ps(iz,jz);
205                         dxB         = _mm_sub_ps(ix,jxB);
206                         dyB         = _mm_sub_ps(iy,jyB);
207                         dzB         = _mm_sub_ps(iz,jzB);
208             
209             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
210             rsqB        = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
211             rinv        = gmx_mm_invsqrt_ps(rsq);
212             rinvB       = gmx_mm_invsqrt_ps(rsqB);
213             rinv2       = _mm_mul_ps(rinv,rinv);
214             rinv2B      = _mm_mul_ps(rinvB,rinvB);
215             rinv4       = _mm_mul_ps(rinv2,rinv2);
216             rinv4B      = _mm_mul_ps(rinv2B,rinv2B);
217             rinv6       = _mm_mul_ps(rinv4,rinv2);
218             rinv6B      = _mm_mul_ps(rinv4B,rinv2B);
219             
220             rvdw        = _mm_add_ps(rai,raj);
221             rvdwB       = _mm_add_ps(rai,rajB);
222             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
223             ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB,rvdwB)));
224
225             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
226             mask_cmpB   = _mm_cmple_ps(ratioB,still_p5inv);
227             
228             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
229             if( 0 == _mm_movemask_ps(mask_cmp) )
230             {
231                 /* if ratio>still_p5inv for ALL elements */
232                 ccf         = one;
233                 dccf        = _mm_setzero_ps();
234             }
235             else 
236             {
237                 ratio       = _mm_min_ps(ratio,still_p5inv);
238                 theta       = _mm_mul_ps(ratio,still_pip5);
239                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
240                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
241                 ccf         = _mm_mul_ps(term,term);
242                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
243                                          _mm_mul_ps(sinq,theta));
244             }
245             if( 0 == _mm_movemask_ps(mask_cmpB) )
246             {
247                 /* if ratio>still_p5inv for ALL elements */
248                 ccfB        = one;
249                 dccfB       = _mm_setzero_ps();
250             }
251             else 
252             {
253                 ratioB      = _mm_min_ps(ratioB,still_p5inv);
254                 thetaB      = _mm_mul_ps(ratioB,still_pip5);
255                 gmx_mm_sincos_ps(thetaB,&sinqB,&cosqB);
256                 termB       = _mm_mul_ps(half,_mm_sub_ps(one,cosqB));
257                 ccfB        = _mm_mul_ps(termB,termB);
258                 dccfB       = _mm_mul_ps(_mm_mul_ps(two,termB),
259                                          _mm_mul_ps(sinqB,thetaB));
260             }
261             
262             prod        = _mm_mul_ps(still_p4,vaj);
263             prodB       = _mm_mul_ps(still_p4,vajB);
264             icf4        = _mm_mul_ps(ccf,rinv4);
265             icf4B       = _mm_mul_ps(ccfB,rinv4B);
266             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
267             icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccfB),dccfB), rinv6B);
268
269             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
270             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_mul_ps(prod_ai,icf4B));
271             
272             gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod,icf4) , _mm_mul_ps(prodB,icf4B) ) );
273             
274             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
275             dadx+=4;
276             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
277             dadx+=4;
278             _mm_store_ps(dadx,_mm_mul_ps(prodB,icf6B));
279             dadx+=4;
280             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6B));
281             dadx+=4;
282                 } 
283  
284         for(;k<nj1-offset;k+=4)
285                 {
286                         jnrA        = jjnr[k];   
287                         jnrB        = jjnr[k+1];
288                         jnrC        = jjnr[k+2];
289                         jnrD        = jjnr[k+3];
290             
291             j3A         = 3*jnrA;  
292                         j3B         = 3*jnrB;
293                         j3C         = 3*jnrC;
294                         j3D         = 3*jnrD;
295             
296             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
297             
298             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
299                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
300             
301                         dx          = _mm_sub_ps(ix,jx);
302                         dy          = _mm_sub_ps(iy,jy);
303                         dz          = _mm_sub_ps(iz,jz);
304             
305             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
306             rinv        = gmx_mm_invsqrt_ps(rsq);
307             rinv2       = _mm_mul_ps(rinv,rinv);
308             rinv4       = _mm_mul_ps(rinv2,rinv2);
309             rinv6       = _mm_mul_ps(rinv4,rinv2);
310             
311             rvdw        = _mm_add_ps(rai,raj);
312             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
313             
314             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
315
316             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
317             if(0 == _mm_movemask_ps(mask_cmp))
318             {
319                 /* if ratio>still_p5inv for ALL elements */
320                 ccf         = one;
321                 dccf        = _mm_setzero_ps();
322             }
323             else 
324             {
325                 ratio       = _mm_min_ps(ratio,still_p5inv);
326                 theta       = _mm_mul_ps(ratio,still_pip5);
327                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
328                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
329                 ccf         = _mm_mul_ps(term,term);
330                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
331                                          _mm_mul_ps(sinq,theta));
332             }
333             
334             prod        = _mm_mul_ps(still_p4,vaj);
335             icf4        = _mm_mul_ps(ccf,rinv4);
336             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
337
338             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
339             
340             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
341             
342             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
343             dadx+=4;
344             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
345             dadx+=4;
346                 } 
347         
348         if(offset!=0)
349         {
350             if(offset==1)
351             {
352                 jnrA        = jjnr[k];   
353                 j3A         = 3*jnrA;  
354                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
355                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
356                 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA,vaj);
357                 mask        = mask1;
358             } 
359             else if(offset==2)
360             {
361                 jnrA        = jjnr[k];   
362                 jnrB        = jjnr[k+1];
363                 j3A         = 3*jnrA;  
364                 j3B         = 3*jnrB;
365                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
366                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
367                 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA,vsolv+jnrB,vaj);
368                 mask        = mask2;
369             }
370             else
371             {
372                 /* offset must be 3 */   
373                 jnrA        = jjnr[k];   
374                 jnrB        = jjnr[k+1];
375                 jnrC        = jjnr[k+2];
376                 j3A         = 3*jnrA;  
377                 j3B         = 3*jnrB;
378                 j3C         = 3*jnrC;
379                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
380                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
381                 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vaj);
382                 mask        = mask3;
383             }
384
385                         dx          = _mm_sub_ps(ix,jx);
386                         dy          = _mm_sub_ps(iy,jy);
387                         dz          = _mm_sub_ps(iz,jz);
388             
389             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
390             rinv        = gmx_mm_invsqrt_ps(rsq);
391             rinv2       = _mm_mul_ps(rinv,rinv);
392             rinv4       = _mm_mul_ps(rinv2,rinv2);
393             rinv6       = _mm_mul_ps(rinv4,rinv2);
394             
395             rvdw        = _mm_add_ps(rai,raj);
396             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
397             
398             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
399             
400             if(0 == _mm_movemask_ps(mask_cmp))
401             {
402                 /* if ratio>still_p5inv for ALL elements */
403                 ccf         = one;
404                 dccf        = _mm_setzero_ps();
405             }
406             else 
407             {
408                 ratio       = _mm_min_ps(ratio,still_p5inv);
409                 theta       = _mm_mul_ps(ratio,still_pip5);
410                 gmx_mm_sincos_ps(theta,&sinq,&cosq);            
411                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
412                 ccf         = _mm_mul_ps(term,term);
413                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
414                                          _mm_mul_ps(sinq,theta));
415             }
416
417             prod        = _mm_mul_ps(still_p4,vaj);
418             icf4        = _mm_mul_ps(ccf,rinv4);
419             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
420             
421             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
422             
423             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
424             dadx+=4;
425             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
426             dadx+=4;
427             
428             tmp = _mm_mul_ps(prod_ai,icf4);
429
430             if(offset==1)
431             {
432                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
433             } 
434             else if(offset==2)
435             {
436                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
437             }
438             else
439             {
440                 /* offset must be 3 */
441                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
442             }
443         }
444         GMX_MM_UPDATE_1POT_PS(gpi,work+ii);
445         }
446
447         /* Sum up the polarization energy from other nodes */
448         if(PARTDECOMP(cr))
449         {
450                 gmx_sum(natoms, work, cr);
451         }
452         else if(DOMAINDECOMP(cr))
453         {
454                 dd_atom_sum_real(cr->dd, work);
455         }
456         
457         /* Compute the radii */
458         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
459         {               
460                 if(born->use[i] != 0)
461                 {
462                         gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
463                         gpi2             = gpi_ai * gpi_ai;
464                         born->bRad[i]   = factor*gmx_invsqrt(gpi2);
465                         fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
466                 }
467         }
468                 
469         /* Extra (local) communication required for DD */
470         if(DOMAINDECOMP(cr))
471         {
472                 dd_atom_spread_real(cr->dd, born->bRad);
473                 dd_atom_spread_real(cr->dd, fr->invsqrta);
474         }
475     
476         return 0;       
477 }
478
479
480 int 
481 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
482                                 const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
483 {
484         int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
485     int jnrA,jnrB,jnrC,jnrD;
486     int j3A,j3B,j3C,j3D;
487     int jnrE,jnrF,jnrG,jnrH;
488     int j3E,j3F,j3G,j3H;
489         float shX,shY,shZ;
490         float rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
491         float sum_ai2, sum_ai3,tsum,tchain,doffset;
492         float *obc_param;
493     float *gb_radius;
494     float *work;
495     int *  jjnr;
496     float *dadx;
497     float *shiftvec;
498     float min_rad,rad;
499     
500         __m128 ix,iy,iz,jx,jy,jz;
501         __m128 dx,dy,dz,t1,t2,t3,t4;
502         __m128 rsq,rinv,r;
503         __m128 rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
504         __m128 uij,lij2,uij2,lij3,uij3,diff2;
505         __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
506         __m128 sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
507         __m128 dadx1,dadx2;
508     __m128 logterm;
509         __m128 mask;
510         __m128 obc_mask1,obc_mask2,obc_mask3;
511     __m128 jxB,jyB,jzB,t1B,t2B,t3B,t4B;
512     __m128 dxB,dyB,dzB,rsqB,rinvB,rB;
513         __m128 rajB, raj_invB,rai_inv2B,sk2B,lijB,dlijB,duijB;
514         __m128 uijB,lij2B,uij2B,lij3B,uij3B,diff2B;
515         __m128 lij_invB,sk2_invB,prodB;
516         __m128 sk_ajB,sk2_ajB,sk2_rinvB;
517         __m128 dadx1B,dadx2B;
518     __m128 logtermB;
519     __m128 obc_mask1B,obc_mask2B,obc_mask3B;
520
521     __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
522         __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
523         __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
524         
525     __m128 oneeighth   = _mm_set1_ps(0.125);
526     __m128 onefourth   = _mm_set1_ps(0.25);
527
528         const __m128 half  = _mm_set1_ps(0.5f);
529         const __m128 three = _mm_set1_ps(3.0f);
530         const __m128 one   = _mm_set1_ps(1.0f);
531         const __m128 two   = _mm_set1_ps(2.0f);
532         const __m128 zero  = _mm_set1_ps(0.0f);
533         const __m128 neg   = _mm_set1_ps(-1.0f);
534         
535         /* Set the dielectric offset */
536         doffset   = born->gb_doffset;
537         gb_radius = born->gb_radius;
538     obc_param = born->param;
539     work      = born->gpol_hct_work;
540     jjnr      = nl->jjnr;
541     dadx      = fr->dadx;
542     shiftvec  = fr->shift_vec[0];
543     
544     jx        = _mm_setzero_ps();
545     jy        = _mm_setzero_ps();
546     jz        = _mm_setzero_ps();
547     
548     jnrA = jnrB = jnrC = jnrD = 0;
549     
550         for(i=0;i<born->nr;i++)
551         {
552                 work[i] = 0;
553         }
554         
555         for(i=0;i<nl->nri;i++)
556         {
557         ii     = nl->iinr[i];
558                 ii3        = ii*3;
559         is3    = 3*nl->shift[i];     
560         shX    = shiftvec[is3];  
561         shY    = shiftvec[is3+1];
562         shZ    = shiftvec[is3+2];
563         nj0    = nl->jindex[i];      
564         nj1    = nl->jindex[i+1];    
565         
566         ix     = _mm_set1_ps(shX+x[ii3+0]);
567                 iy     = _mm_set1_ps(shY+x[ii3+1]);
568                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
569                 
570                 offset = (nj1-nj0)%4;
571
572                 rai    = _mm_load1_ps(gb_radius+ii);
573                 rai_inv= gmx_mm_inv_ps(rai);
574                                 
575                 sum_ai = _mm_setzero_ps();
576                 
577                 sk_ai  = _mm_load1_ps(born->param+ii);
578                 sk2_ai = _mm_mul_ps(sk_ai,sk_ai);
579                                 
580                 for(k=nj0;k<nj1-4-offset;k+=8)
581                 {
582                         jnrA        = jjnr[k];   
583                         jnrB        = jjnr[k+1];
584                         jnrC        = jjnr[k+2];
585                         jnrD        = jjnr[k+3];
586                         jnrE        = jjnr[k+4];   
587                         jnrF        = jjnr[k+5];
588                         jnrG        = jjnr[k+6];
589                         jnrH        = jjnr[k+7];
590                         
591             j3A         = 3*jnrA;  
592                         j3B         = 3*jnrB;
593                         j3C         = 3*jnrC;
594                         j3D         = 3*jnrD;
595             j3E         = 3*jnrE;  
596                         j3F         = 3*jnrF;
597                         j3G         = 3*jnrG;
598                         j3H         = 3*jnrH;
599             
600             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
601             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
602             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
603             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
604             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
605             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE,obc_param+jnrF,obc_param+jnrG,obc_param+jnrH,sk_ajB);
606                         
607             dx    = _mm_sub_ps(ix, jx);
608                         dy    = _mm_sub_ps(iy, jy);
609                         dz    = _mm_sub_ps(iz, jz);
610             dxB   = _mm_sub_ps(ix, jxB);
611                         dyB   = _mm_sub_ps(iy, jyB);
612                         dzB   = _mm_sub_ps(iz, jzB);
613                         
614             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
615             rsqB        = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
616                                     
617             rinv        = gmx_mm_invsqrt_ps(rsq);
618             r           = _mm_mul_ps(rsq,rinv);
619             rinvB       = gmx_mm_invsqrt_ps(rsqB);
620             rB          = _mm_mul_ps(rsqB,rinvB);
621
622                         /* Compute raj_inv aj1-4 */
623             raj_inv     = gmx_mm_inv_ps(raj);
624             raj_invB    = gmx_mm_inv_ps(rajB);
625
626             /* Evaluate influence of atom aj -> ai */
627             t1            = _mm_add_ps(r,sk_aj);
628             t2            = _mm_sub_ps(r,sk_aj);
629             t3            = _mm_sub_ps(sk_aj,r);
630             t1B           = _mm_add_ps(rB,sk_ajB);
631             t2B           = _mm_sub_ps(rB,sk_ajB);
632             t3B           = _mm_sub_ps(sk_ajB,rB);
633             obc_mask1     = _mm_cmplt_ps(rai, t1);
634             obc_mask2     = _mm_cmplt_ps(rai, t2);
635             obc_mask3     = _mm_cmplt_ps(rai, t3);
636             obc_mask1B    = _mm_cmplt_ps(rai, t1B);
637             obc_mask2B    = _mm_cmplt_ps(rai, t2B);
638             obc_mask3B    = _mm_cmplt_ps(rai, t3B);
639
640             uij           = gmx_mm_inv_ps(t1);
641             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
642                                       _mm_andnot_ps(obc_mask2,rai_inv));
643             dlij          = _mm_and_ps(one,obc_mask2);
644             uij2          = _mm_mul_ps(uij, uij);
645             uij3          = _mm_mul_ps(uij2,uij);
646             lij2          = _mm_mul_ps(lij, lij);
647             lij3          = _mm_mul_ps(lij2,lij);
648
649             uijB          = gmx_mm_inv_ps(t1B);
650             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
651                                       _mm_andnot_ps(obc_mask2B,rai_inv));
652             dlijB         = _mm_and_ps(one,obc_mask2B);
653             uij2B         = _mm_mul_ps(uijB, uijB);
654             uij3B         = _mm_mul_ps(uij2B,uijB);
655             lij2B         = _mm_mul_ps(lijB, lijB);
656             lij3B         = _mm_mul_ps(lij2B,lijB);
657
658             diff2         = _mm_sub_ps(uij2,lij2);
659             lij_inv       = gmx_mm_invsqrt_ps(lij2);
660             sk2_aj        = _mm_mul_ps(sk_aj,sk_aj);
661             sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
662             prod          = _mm_mul_ps(onefourth,sk2_rinv);
663
664             diff2B        = _mm_sub_ps(uij2B,lij2B);
665             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
666             sk2_ajB       = _mm_mul_ps(sk_ajB,sk_ajB);
667             sk2_rinvB     = _mm_mul_ps(sk2_ajB,rinvB);
668             prodB         = _mm_mul_ps(onefourth,sk2_rinvB);
669
670             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
671             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
672             
673             t1            = _mm_sub_ps(lij,uij);
674             t2            = _mm_mul_ps(diff2,
675                                        _mm_sub_ps(_mm_mul_ps(onefourth,r),
676                                                   prod));
677             t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
678             t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
679             t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
680             t4            = _mm_and_ps(t4,obc_mask3);
681             t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
682             
683             t1B           = _mm_sub_ps(lijB,uijB);
684             t2B           = _mm_mul_ps(diff2B,
685                                        _mm_sub_ps(_mm_mul_ps(onefourth,rB),
686                                                   prodB));
687             t3B           = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
688             t1B           = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
689             t4B           = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lijB));
690             t4B           = _mm_and_ps(t4B,obc_mask3B);
691             t1B           = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
692             
693             sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1,obc_mask1), _mm_and_ps(t1B,obc_mask1B) ));
694             
695             t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
696                                        _mm_mul_ps(prod,lij3));
697             t1            = _mm_sub_ps(t1,
698                                        _mm_mul_ps(onefourth,
699                                                   _mm_add_ps(_mm_mul_ps(lij,rinv),
700                                                              _mm_mul_ps(lij3,r))));
701             t2            = _mm_mul_ps(onefourth,
702                                        _mm_add_ps(_mm_mul_ps(uij,rinv),
703                                                   _mm_mul_ps(uij3,r)));
704             t2            = _mm_sub_ps(t2,
705                                        _mm_add_ps(_mm_mul_ps(half,uij2),
706                                                       _mm_mul_ps(prod,uij3)));
707             t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
708                                        _mm_mul_ps(rinv,rinv));
709             t3            = _mm_sub_ps(t3,
710                                        _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
711                                                   _mm_add_ps(one,
712                                                              _mm_mul_ps(sk2_rinv,rinv))));
713             t1            = _mm_mul_ps(rinv,
714                                        _mm_add_ps(_mm_mul_ps(dlij,t1),
715                                                   _mm_add_ps(t2,t3)));
716             
717
718             
719             t1B           = _mm_add_ps(_mm_mul_ps(half,lij2B),
720                                        _mm_mul_ps(prodB,lij3B));
721             t1B           = _mm_sub_ps(t1B,
722                                        _mm_mul_ps(onefourth,
723                                                   _mm_add_ps(_mm_mul_ps(lijB,rinvB),
724                                                              _mm_mul_ps(lij3B,rB))));
725             t2B           = _mm_mul_ps(onefourth,
726                                        _mm_add_ps(_mm_mul_ps(uijB,rinvB),
727                                                   _mm_mul_ps(uij3B,rB)));
728             t2B           = _mm_sub_ps(t2B,
729                                        _mm_add_ps(_mm_mul_ps(half,uij2B),
730                                                   _mm_mul_ps(prodB,uij3B)));
731             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
732                                        _mm_mul_ps(rinvB,rinvB));
733             t3B           = _mm_sub_ps(t3B,
734                                        _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
735                                                   _mm_add_ps(one,
736                                                              _mm_mul_ps(sk2_rinvB,rinvB))));
737             t1B           = _mm_mul_ps(rinvB,
738                                        _mm_add_ps(_mm_mul_ps(dlijB,t1B),
739                                                   _mm_add_ps(t2B,t3B)));
740             
741             dadx1         = _mm_and_ps(t1,obc_mask1);
742             dadx1B        = _mm_and_ps(t1B,obc_mask1B);
743
744
745             /* Evaluate influence of atom ai -> aj */
746             t1            = _mm_add_ps(r,sk_ai);
747             t2            = _mm_sub_ps(r,sk_ai);
748             t3            = _mm_sub_ps(sk_ai,r);
749             t1B           = _mm_add_ps(rB,sk_ai);
750             t2B           = _mm_sub_ps(rB,sk_ai);
751             t3B           = _mm_sub_ps(sk_ai,rB);
752             obc_mask1     = _mm_cmplt_ps(raj, t1);
753             obc_mask2     = _mm_cmplt_ps(raj, t2);
754             obc_mask3     = _mm_cmplt_ps(raj, t3);
755             obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
756             obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
757             obc_mask3B    = _mm_cmplt_ps(rajB, t3B);
758             
759             uij           = gmx_mm_inv_ps(t1);
760             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
761                                       _mm_andnot_ps(obc_mask2,raj_inv));
762             dlij          = _mm_and_ps(one,obc_mask2);
763             uij2          = _mm_mul_ps(uij, uij);
764             uij3          = _mm_mul_ps(uij2,uij);
765             lij2          = _mm_mul_ps(lij, lij);
766             lij3          = _mm_mul_ps(lij2,lij);
767
768             uijB          = gmx_mm_inv_ps(t1B);
769             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
770                                       _mm_andnot_ps(obc_mask2B,raj_invB));
771             dlijB         = _mm_and_ps(one,obc_mask2B);
772             uij2B         = _mm_mul_ps(uijB, uijB);
773             uij3B         = _mm_mul_ps(uij2B,uijB);
774             lij2B         = _mm_mul_ps(lijB, lijB);
775             lij3B         = _mm_mul_ps(lij2B,lijB);
776
777             diff2         = _mm_sub_ps(uij2,lij2);
778             lij_inv       = gmx_mm_invsqrt_ps(lij2);
779             sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
780             prod          = _mm_mul_ps(onefourth,sk2_rinv);
781
782             diff2B        = _mm_sub_ps(uij2B,lij2B);
783             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
784             sk2_rinvB     = _mm_mul_ps(sk2_ai,rinvB);
785             prodB         = _mm_mul_ps(onefourth,sk2_rinvB);
786
787             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
788             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
789
790             t1            = _mm_sub_ps(lij,uij);
791             t2            = _mm_mul_ps(diff2,
792                                        _mm_sub_ps(_mm_mul_ps(onefourth,r),
793                                                   prod));
794             t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
795             t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
796             t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
797             t4            = _mm_and_ps(t4,obc_mask3);
798             t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
799
800             t1B           = _mm_sub_ps(lijB,uijB);
801             t2B           = _mm_mul_ps(diff2B,
802                                        _mm_sub_ps(_mm_mul_ps(onefourth,rB),
803                                                   prodB));
804             t3B           = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
805             t1B           = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
806             t4B           = _mm_mul_ps(two,_mm_sub_ps(raj_invB,lijB));
807             t4B           = _mm_and_ps(t4B,obc_mask3B);
808             t1B           = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
809             
810             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
811             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_and_ps(t1B,obc_mask1B));
812             
813             t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
814                                        _mm_mul_ps(prod,lij3));
815             t1            = _mm_sub_ps(t1,
816                                        _mm_mul_ps(onefourth,
817                                                   _mm_add_ps(_mm_mul_ps(lij,rinv),
818                                                              _mm_mul_ps(lij3,r))));
819             t2            = _mm_mul_ps(onefourth,
820                                        _mm_add_ps(_mm_mul_ps(uij,rinv),
821                                                   _mm_mul_ps(uij3,r)));
822             t2            = _mm_sub_ps(t2,
823                                        _mm_add_ps(_mm_mul_ps(half,uij2),
824                                                   _mm_mul_ps(prod,uij3)));
825             t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
826                                        _mm_mul_ps(rinv,rinv));
827             t3            = _mm_sub_ps(t3,
828                                        _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
829                                                   _mm_add_ps(one,
830                                                              _mm_mul_ps(sk2_rinv,rinv))));
831             t1            = _mm_mul_ps(rinv,
832                                        _mm_add_ps(_mm_mul_ps(dlij,t1),
833                                                   _mm_add_ps(t2,t3)));
834
835             
836             t1B           = _mm_add_ps(_mm_mul_ps(half,lij2B),
837                                        _mm_mul_ps(prodB,lij3B));
838             t1B           = _mm_sub_ps(t1B,
839                                        _mm_mul_ps(onefourth,
840                                                   _mm_add_ps(_mm_mul_ps(lijB,rinvB),
841                                                              _mm_mul_ps(lij3B,rB))));
842             t2B           = _mm_mul_ps(onefourth,
843                                        _mm_add_ps(_mm_mul_ps(uijB,rinvB),
844                                                   _mm_mul_ps(uij3B,rB)));
845             t2B           = _mm_sub_ps(t2B,
846                                        _mm_add_ps(_mm_mul_ps(half,uij2B),
847                                                   _mm_mul_ps(prodB,uij3B)));
848             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
849                                        _mm_mul_ps(rinvB,rinvB));
850             t3B           = _mm_sub_ps(t3B,
851                                        _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
852                                                   _mm_add_ps(one,
853                                                              _mm_mul_ps(sk2_rinvB,rinvB))));
854             t1B           = _mm_mul_ps(rinvB,
855                                        _mm_add_ps(_mm_mul_ps(dlijB,t1B),
856                                                   _mm_add_ps(t2B,t3B)));
857
858             
859             dadx2         = _mm_and_ps(t1,obc_mask1);
860             dadx2B        = _mm_and_ps(t1B,obc_mask1B);
861             
862             _mm_store_ps(dadx,dadx1);
863             dadx += 4;
864             _mm_store_ps(dadx,dadx2);
865             dadx += 4;
866             _mm_store_ps(dadx,dadx1B);
867             dadx += 4;
868             _mm_store_ps(dadx,dadx2B);
869             dadx += 4;
870             
871         } /* end normal inner loop */
872         
873                 for(;k<nj1-offset;k+=4)
874                 {
875                         jnrA        = jjnr[k];   
876                         jnrB        = jjnr[k+1];
877                         jnrC        = jjnr[k+2];
878                         jnrD        = jjnr[k+3];
879                         
880             j3A         = 3*jnrA;  
881                         j3B         = 3*jnrB;
882                         j3C         = 3*jnrC;
883                         j3D         = 3*jnrD;
884             
885             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
886             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
887             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
888                         
889             dx    = _mm_sub_ps(ix, jx);
890                         dy    = _mm_sub_ps(iy, jy);
891                         dz    = _mm_sub_ps(iz, jz);
892                         
893             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
894             
895             rinv        = gmx_mm_invsqrt_ps(rsq);
896             r           = _mm_mul_ps(rsq,rinv);
897             
898                         /* Compute raj_inv aj1-4 */
899             raj_inv     = gmx_mm_inv_ps(raj);
900             
901             /* Evaluate influence of atom aj -> ai */
902             t1            = _mm_add_ps(r,sk_aj);
903             obc_mask1     = _mm_cmplt_ps(rai, t1);
904             
905             if(_mm_movemask_ps(obc_mask1))
906             {
907                 /* If any of the elements has rai<dr+sk, this is executed */
908                 t2            = _mm_sub_ps(r,sk_aj);
909                 t3            = _mm_sub_ps(sk_aj,r);
910                 
911                 obc_mask2     = _mm_cmplt_ps(rai, t2);
912                 obc_mask3     = _mm_cmplt_ps(rai, t3);
913                 
914                 uij           = gmx_mm_inv_ps(t1);
915                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
916                                           _mm_andnot_ps(obc_mask2,rai_inv));
917                 dlij          = _mm_and_ps(one,obc_mask2);
918                 uij2          = _mm_mul_ps(uij, uij);
919                 uij3          = _mm_mul_ps(uij2,uij);
920                 lij2          = _mm_mul_ps(lij, lij);
921                 lij3          = _mm_mul_ps(lij2,lij);
922                 diff2         = _mm_sub_ps(uij2,lij2);
923                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
924                 sk2_aj        = _mm_mul_ps(sk_aj,sk_aj);
925                 sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
926                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
927                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
928                 t1            = _mm_sub_ps(lij,uij);
929                 t2            = _mm_mul_ps(diff2,
930                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
931                                                       prod));
932                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
933                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
934                 t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
935                 t4            = _mm_and_ps(t4,obc_mask3);
936                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
937                 sum_ai        = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
938                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
939                                            _mm_mul_ps(prod,lij3));
940                 t1            = _mm_sub_ps(t1,
941                                            _mm_mul_ps(onefourth,
942                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
943                                                                  _mm_mul_ps(lij3,r))));
944                 t2            = _mm_mul_ps(onefourth,
945                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
946                                                       _mm_mul_ps(uij3,r)));
947                 t2            = _mm_sub_ps(t2,
948                                            _mm_add_ps(_mm_mul_ps(half,uij2),
949                                                       _mm_mul_ps(prod,uij3)));
950                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
951                                            _mm_mul_ps(rinv,rinv));
952                 t3            = _mm_sub_ps(t3,
953                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
954                                                       _mm_add_ps(one,
955                                                                  _mm_mul_ps(sk2_rinv,rinv))));
956                 t1            = _mm_mul_ps(rinv,
957                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
958                                                       _mm_add_ps(t2,t3)));
959                 
960                 dadx1         = _mm_and_ps(t1,obc_mask1);
961             }
962             else 
963             {
964                 dadx1         = _mm_setzero_ps();
965             }
966             
967             /* Evaluate influence of atom ai -> aj */
968             t1            = _mm_add_ps(r,sk_ai);
969             obc_mask1     = _mm_cmplt_ps(raj, t1);
970             
971             if(_mm_movemask_ps(obc_mask1))
972             {
973                 t2            = _mm_sub_ps(r,sk_ai);
974                 t3            = _mm_sub_ps(sk_ai,r);
975                 obc_mask2     = _mm_cmplt_ps(raj, t2);
976                 obc_mask3     = _mm_cmplt_ps(raj, t3);
977                 
978                 uij           = gmx_mm_inv_ps(t1);
979                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
980                                           _mm_andnot_ps(obc_mask2,raj_inv));
981                 dlij          = _mm_and_ps(one,obc_mask2);
982                 uij2          = _mm_mul_ps(uij, uij);
983                 uij3          = _mm_mul_ps(uij2,uij);
984                 lij2          = _mm_mul_ps(lij, lij);
985                 lij3          = _mm_mul_ps(lij2,lij);
986                 diff2         = _mm_sub_ps(uij2,lij2);
987                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
988                 sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
989                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
990                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
991                 t1            = _mm_sub_ps(lij,uij);
992                 t2            = _mm_mul_ps(diff2,
993                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
994                                                       prod));
995                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
996                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
997                 t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
998                 t4            = _mm_and_ps(t4,obc_mask3);
999                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1000                 
1001                 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
1002                 
1003                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1004                                            _mm_mul_ps(prod,lij3));
1005                 t1            = _mm_sub_ps(t1,
1006                                            _mm_mul_ps(onefourth,
1007                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1008                                                                  _mm_mul_ps(lij3,r))));
1009                 t2            = _mm_mul_ps(onefourth,
1010                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1011                                                       _mm_mul_ps(uij3,r)));
1012                 t2            = _mm_sub_ps(t2,
1013                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1014                                                       _mm_mul_ps(prod,uij3)));
1015                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1016                                            _mm_mul_ps(rinv,rinv));
1017                 t3            = _mm_sub_ps(t3,
1018                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1019                                                       _mm_add_ps(one,
1020                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1021                 t1            = _mm_mul_ps(rinv,
1022                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1023                                                       _mm_add_ps(t2,t3)));
1024                 dadx2         = _mm_and_ps(t1,obc_mask1);
1025             }
1026             else
1027             {
1028                 dadx2         = _mm_setzero_ps();
1029             }
1030             
1031             _mm_store_ps(dadx,dadx1);
1032             dadx += 4;
1033             _mm_store_ps(dadx,dadx2);
1034             dadx += 4;            
1035         } /* end normal inner loop */
1036         
1037         if(offset!=0)
1038         {
1039             if(offset==1)
1040             {
1041                 jnrA        = jjnr[k];   
1042                 j3A         = 3*jnrA;  
1043                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1044                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
1045                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA,sk_aj);
1046                 mask        = mask1;
1047             } 
1048             else if(offset==2)
1049             {
1050                 jnrA        = jjnr[k];   
1051                 jnrB        = jjnr[k+1];
1052                 j3A         = 3*jnrA;  
1053                 j3B         = 3*jnrB;
1054                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1055                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
1056                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA,obc_param+jnrB,sk_aj);
1057                 mask        = mask2;
1058             }
1059             else
1060             {
1061                 /* offset must be 3 */   
1062                 jnrA        = jjnr[k];   
1063                 jnrB        = jjnr[k+1];
1064                 jnrC        = jjnr[k+2];
1065                 j3A         = 3*jnrA;  
1066                 j3B         = 3*jnrB;
1067                 j3C         = 3*jnrC;
1068                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1069                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
1070                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,sk_aj);
1071                 mask        = mask3;
1072             }
1073
1074             dx    = _mm_sub_ps(ix, jx);
1075                         dy    = _mm_sub_ps(iy, jy);
1076                         dz    = _mm_sub_ps(iz, jz);
1077                         
1078             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
1079             
1080             rinv        = gmx_mm_invsqrt_ps(rsq);
1081             r           = _mm_mul_ps(rsq,rinv);
1082             
1083                         /* Compute raj_inv aj1-4 */
1084             raj_inv     = gmx_mm_inv_ps(raj);
1085             
1086             /* Evaluate influence of atom aj -> ai */
1087             t1            = _mm_add_ps(r,sk_aj);
1088             obc_mask1     = _mm_cmplt_ps(rai, t1);
1089             obc_mask1     = _mm_and_ps(obc_mask1,mask);
1090
1091             if(_mm_movemask_ps(obc_mask1))
1092             {
1093                 t2            = _mm_sub_ps(r,sk_aj);
1094                 t3            = _mm_sub_ps(sk_aj,r);
1095                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1096                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1097                 
1098                 uij           = gmx_mm_inv_ps(t1);
1099                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1100                                           _mm_andnot_ps(obc_mask2,rai_inv));
1101                 dlij          = _mm_and_ps(one,obc_mask2);
1102                 uij2          = _mm_mul_ps(uij, uij);
1103                 uij3          = _mm_mul_ps(uij2,uij);
1104                 lij2          = _mm_mul_ps(lij, lij);
1105                 lij3          = _mm_mul_ps(lij2,lij);
1106                 diff2         = _mm_sub_ps(uij2,lij2);
1107                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1108                 sk2_aj         = _mm_mul_ps(sk_aj,sk_aj);
1109                 sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
1110                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
1111                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1112                 t1            = _mm_sub_ps(lij,uij);
1113                 t2            = _mm_mul_ps(diff2,
1114                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
1115                                                       prod));
1116                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1117                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1118                 t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
1119                 t4            = _mm_and_ps(t4,obc_mask3);
1120                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1121                 sum_ai        = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
1122                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1123                                            _mm_mul_ps(prod,lij3));
1124                 t1            = _mm_sub_ps(t1,
1125                                            _mm_mul_ps(onefourth,
1126                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1127                                                                  _mm_mul_ps(lij3,r))));
1128                 t2            = _mm_mul_ps(onefourth,
1129                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1130                                                       _mm_mul_ps(uij3,r)));
1131                 t2            = _mm_sub_ps(t2,
1132                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1133                                                       _mm_mul_ps(prod,uij3)));
1134                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1135                                            _mm_mul_ps(rinv,rinv));
1136                 t3            = _mm_sub_ps(t3,
1137                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1138                                                       _mm_add_ps(one,
1139                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1140                 t1            = _mm_mul_ps(rinv,
1141                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1142                                                       _mm_add_ps(t2,t3)));
1143                 dadx1         = _mm_and_ps(t1,obc_mask1);
1144             }
1145             else
1146             {
1147                 dadx1         = _mm_setzero_ps();
1148             }
1149
1150                 /* Evaluate influence of atom ai -> aj */
1151             t1            = _mm_add_ps(r,sk_ai);
1152             obc_mask1     = _mm_cmplt_ps(raj, t1);
1153             obc_mask1     = _mm_and_ps(obc_mask1,mask);
1154             
1155             if(_mm_movemask_ps(obc_mask1))
1156             {
1157                 t2            = _mm_sub_ps(r,sk_ai);
1158                 t3            = _mm_sub_ps(sk_ai,r);
1159                 obc_mask2     = _mm_cmplt_ps(raj, t2);
1160                 obc_mask3     = _mm_cmplt_ps(raj, t3);
1161             
1162                 uij           = gmx_mm_inv_ps(t1);
1163                 lij           = _mm_or_ps(_mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1164                                           _mm_andnot_ps(obc_mask2,raj_inv));
1165                 dlij          = _mm_and_ps(one,obc_mask2);
1166                 uij2          = _mm_mul_ps(uij, uij);
1167                 uij3          = _mm_mul_ps(uij2,uij);
1168                 lij2          = _mm_mul_ps(lij, lij);
1169                 lij3          = _mm_mul_ps(lij2,lij);
1170                 diff2         = _mm_sub_ps(uij2,lij2);
1171                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1172                 sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
1173                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
1174                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1175                 t1            = _mm_sub_ps(lij,uij);
1176                 t2            = _mm_mul_ps(diff2,
1177                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
1178                                                       prod));
1179                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1180                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1181                 t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
1182                 t4            = _mm_and_ps(t4,obc_mask3);
1183                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1184                 
1185                 tmp           = _mm_and_ps(t1,obc_mask1);
1186                 
1187                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1188                                            _mm_mul_ps(prod,lij3));
1189                 t1            = _mm_sub_ps(t1,
1190                                            _mm_mul_ps(onefourth,
1191                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1192                                                                  _mm_mul_ps(lij3,r))));
1193                 t2            = _mm_mul_ps(onefourth,
1194                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1195                                                       _mm_mul_ps(uij3,r)));
1196                 t2            = _mm_sub_ps(t2,
1197                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1198                                                       _mm_mul_ps(prod,uij3)));
1199                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1200                                            _mm_mul_ps(rinv,rinv));
1201                 t3            = _mm_sub_ps(t3,
1202                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1203                                                       _mm_add_ps(one,
1204                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1205                 t1            = _mm_mul_ps(rinv,
1206                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1207                                                       _mm_add_ps(t2,t3)));
1208                 dadx2         = _mm_and_ps(t1,obc_mask1);
1209             }
1210             else
1211             {
1212                 dadx2         = _mm_setzero_ps();
1213                 tmp           = _mm_setzero_ps();
1214             }
1215             
1216             _mm_store_ps(dadx,dadx1);
1217             dadx += 4;
1218             _mm_store_ps(dadx,dadx2);
1219             dadx += 4;
1220             
1221             if(offset==1)
1222             {
1223                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
1224             } 
1225             else if(offset==2)
1226             {
1227                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
1228             }
1229             else
1230             {
1231                 /* offset must be 3 */
1232                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
1233             }
1234             
1235         }
1236         GMX_MM_UPDATE_1POT_PS(sum_ai,work+ii);
1237         
1238         }
1239         
1240         /* Parallel summations */
1241         if(PARTDECOMP(cr))
1242         {
1243                 gmx_sum(natoms, work, cr);
1244         }
1245         else if(DOMAINDECOMP(cr))
1246         {
1247                 dd_atom_sum_real(cr->dd, work);
1248         }
1249         
1250     if(gb_algorithm==egbHCT)
1251     {
1252         /* HCT */
1253         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1254         {
1255                         if(born->use[i] != 0)
1256             {
1257                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset; 
1258                 sum     = 1.0/rr - work[i];
1259                 min_rad = rr + doffset;
1260                 rad     = 1.0/sum; 
1261                 
1262                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
1263                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1264             }
1265         }
1266         
1267         /* Extra communication required for DD */
1268         if(DOMAINDECOMP(cr))
1269         {
1270             dd_atom_spread_real(cr->dd, born->bRad);
1271             dd_atom_spread_real(cr->dd, fr->invsqrta);
1272         }
1273     }
1274     else
1275     {
1276         /* OBC */
1277         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1278         {
1279                         if(born->use[i] != 0)
1280             {
1281                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
1282                 rr_inv2 = 1.0/rr;
1283                 rr      = rr-doffset; 
1284                 rr_inv  = 1.0/rr;
1285                 sum     = rr * work[i];
1286                 sum2    = sum  * sum;
1287                 sum3    = sum2 * sum;
1288                 
1289                 tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1290                 born->bRad[i] = rr_inv - tsum*rr_inv2;
1291                 born->bRad[i] = 1.0 / born->bRad[i];
1292                 
1293                 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1294                 
1295                 tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1296                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1297             }
1298         }
1299         /* Extra (local) communication required for DD */
1300         if(DOMAINDECOMP(cr))
1301         {
1302             dd_atom_spread_real(cr->dd, born->bRad);
1303             dd_atom_spread_real(cr->dd, fr->invsqrta);
1304             dd_atom_spread_real(cr->dd, born->drobc);
1305         }
1306     }
1307
1308         
1309         
1310         return 0;
1311 }
1312
1313
1314
1315 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda, 
1316                                     float *x, float *f, float *fshift, float *shiftvec,
1317                                     int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)                                               
1318 {
1319         int    i,k,n,ii,jnr,ii3,is3,nj0,nj1,offset,n0,n1;
1320         int        jnrA,jnrB,jnrC,jnrD;
1321     int    j3A,j3B,j3C,j3D;
1322         int        jnrE,jnrF,jnrG,jnrH;
1323     int    j3E,j3F,j3G,j3H;
1324         int *  jjnr;
1325     
1326         float   rbi,shX,shY,shZ;
1327         float   *rb;
1328     
1329         __m128 ix,iy,iz;
1330         __m128 jx,jy,jz;
1331         __m128 jxB,jyB,jzB;
1332         __m128 fix,fiy,fiz;
1333         __m128 dx,dy,dz;
1334     __m128 tx,ty,tz;
1335         __m128 dxB,dyB,dzB;
1336     __m128 txB,tyB,tzB;
1337
1338         __m128 rbai,rbaj,rbajB, f_gb, f_gb_ai,f_gbB,f_gb_aiB;
1339         __m128 xmm1,xmm2,xmm3;
1340         
1341         const __m128 two = _mm_set1_ps(2.0f);
1342     
1343         rb     = born->work; 
1344                         
1345     jjnr   = nl->jjnr;
1346     
1347         /* Loop to get the proper form for the Born radius term, sse style */
1348         offset=natoms%4;
1349         
1350   n0 = 0;
1351   n1 = natoms;
1352   
1353         if(gb_algorithm==egbSTILL) 
1354         {
1355                 for(i=n0;i<n1;i++)
1356                 {
1357       rbi   = born->bRad[i];
1358                         rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1359                 }
1360         }
1361         else if(gb_algorithm==egbHCT) 
1362         {
1363                 for(i=n0;i<n1;i++)
1364                 {
1365       rbi   = born->bRad[i];
1366                         rb[i] = rbi * rbi * dvda[i];
1367                 }
1368         }
1369         else if(gb_algorithm==egbOBC) 
1370         {
1371                 for(i=n0;i<n1;i++)
1372                 {
1373       rbi   = born->bRad[i];
1374                         rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1375                 }
1376         }
1377     
1378     jz = _mm_setzero_ps();
1379     
1380     n = j3A = j3B = j3C = j3D = 0;
1381     
1382         for(i=0;i<nl->nri;i++)
1383         {
1384         ii     = nl->iinr[i];
1385                 ii3        = ii*3;
1386         is3    = 3*nl->shift[i];     
1387         shX    = shiftvec[is3];  
1388         shY    = shiftvec[is3+1];
1389         shZ    = shiftvec[is3+2];
1390         nj0    = nl->jindex[i];      
1391         nj1    = nl->jindex[i+1];    
1392
1393         ix     = _mm_set1_ps(shX+x[ii3+0]);
1394                 iy     = _mm_set1_ps(shY+x[ii3+1]);
1395                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
1396                 
1397                 offset = (nj1-nj0)%4;
1398                 
1399                 rbai   = _mm_load1_ps(rb+ii);                   
1400                 fix    = _mm_setzero_ps();
1401                 fiy    = _mm_setzero_ps();
1402                 fiz    = _mm_setzero_ps();      
1403                                 
1404
1405         for(k=nj0;k<nj1-offset;k+=4)
1406                 {
1407                         jnrA        = jjnr[k];   
1408                         jnrB        = jjnr[k+1];
1409                         jnrC        = jjnr[k+2];
1410                         jnrD        = jjnr[k+3];
1411             
1412             j3A         = 3*jnrA;  
1413                         j3B         = 3*jnrB;
1414                         j3C         = 3*jnrC;
1415                         j3D         = 3*jnrD;
1416             
1417             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
1418             
1419                         dx          = _mm_sub_ps(ix,jx);
1420                         dy          = _mm_sub_ps(iy,jy);
1421                         dz          = _mm_sub_ps(iz,jz);
1422             
1423             GMX_MM_LOAD_4VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rb+jnrD,rbaj);
1424             
1425                         /* load chain rule terms for j1-4 */
1426                         f_gb        = _mm_load_ps(dadx);
1427                         dadx += 4;
1428                         f_gb_ai     = _mm_load_ps(dadx);
1429                         dadx += 4;
1430                         
1431             /* calculate scalar force */
1432             f_gb    = _mm_mul_ps(f_gb,rbai); 
1433             f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1434             f_gb    = _mm_add_ps(f_gb,f_gb_ai);
1435             
1436             tx     = _mm_mul_ps(f_gb,dx);
1437             ty     = _mm_mul_ps(f_gb,dy);
1438             tz     = _mm_mul_ps(f_gb,dz);
1439             
1440             fix    = _mm_add_ps(fix,tx);
1441             fiy    = _mm_add_ps(fiy,ty);
1442             fiz    = _mm_add_ps(fiz,tz);
1443             
1444             GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A,f+j3B,f+j3C,f+j3D,tx,ty,tz);
1445                 }
1446         
1447                 /*deal with odd elements */
1448                 if(offset!=0) 
1449         {
1450             if(offset==1)
1451             {
1452                 jnrA        = jjnr[k];   
1453                 j3A         = 3*jnrA; 
1454                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1455                 GMX_MM_LOAD_1VALUE_PS(rb+jnrA,rbaj);
1456             } 
1457             else if(offset==2)
1458             {
1459                 jnrA        = jjnr[k];   
1460                 jnrB        = jjnr[k+1];
1461                 j3A         = 3*jnrA;  
1462                 j3B         = 3*jnrB;
1463                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1464                 GMX_MM_LOAD_2VALUES_PS(rb+jnrA,rb+jnrB,rbaj);
1465             }
1466             else
1467             {
1468                 /* offset must be 3 */   
1469                 jnrA        = jjnr[k];   
1470                 jnrB        = jjnr[k+1];
1471                 jnrC        = jjnr[k+2];
1472                 j3A         = 3*jnrA;  
1473                 j3B         = 3*jnrB;
1474                 j3C         = 3*jnrC;
1475                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1476                 GMX_MM_LOAD_3VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rbaj);
1477             }
1478             
1479             dx          = _mm_sub_ps(ix,jx);
1480             dy          = _mm_sub_ps(iy,jy);
1481             dz          = _mm_sub_ps(iz,jz);
1482             
1483             /* load chain rule terms for j1-4 */
1484             f_gb        = _mm_load_ps(dadx);
1485             dadx += 4;
1486             f_gb_ai     = _mm_load_ps(dadx);
1487             dadx += 4;
1488             
1489             /* calculate scalar force */
1490             f_gb    = _mm_mul_ps(f_gb,rbai); 
1491             f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1492             f_gb    = _mm_add_ps(f_gb,f_gb_ai);
1493             
1494             tx     = _mm_mul_ps(f_gb,dx);
1495             ty     = _mm_mul_ps(f_gb,dy);
1496             tz     = _mm_mul_ps(f_gb,dz);
1497             
1498             fix    = _mm_add_ps(fix,tx);
1499             fiy    = _mm_add_ps(fiy,ty);
1500             fiz    = _mm_add_ps(fiz,tz);
1501             
1502             if(offset==1)
1503             {
1504                 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A,tx,ty,tz);
1505             } 
1506             else if(offset==2)
1507             {
1508                 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A,f+j3B,tx,ty,tz);
1509             }
1510             else
1511             {
1512                 /* offset must be 3 */
1513                 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A,f+j3B,f+j3C,tx,ty,tz);
1514             }
1515         } 
1516         
1517                 /* fix/fiy/fiz now contain four partial force terms, that all should be
1518          * added to the i particle forces and shift forces. 
1519          */
1520                 gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,f+ii3,fshift+is3);
1521         }       
1522     
1523         return 0;       
1524 }
1525
1526
1527 #else
1528 /* keep compiler happy */
1529 int genborn_sse_dummy;
1530
1531 #endif /* SSE intrinsics available */