/*
 * Copyright (c) Erik Lindahl, David van der Spoel 2003
 *
 * This file is generated automatically at compile time
 * by the program mknb in the Gromacs distribution.
 *
 * Options used when generating this file:
 * Language:         c
 * Precision:        single
 * Threads:          no
 * Software invsqrt: yes
 * Prefetch forces:  no
 * Comments:         no
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <math.h>
#include <vec.h>

#include <emmintrin.h>

#include <gmx_sse2_single.h>

/* get gmx_gbdata_t */
#include "../nb_kerneltype.h"

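/*
 * Gromacs nonbonded kernel nb_kernel430 (SSE2, single precision)
 * Coulomb interaction:     Generalized-Born
 * VdW interaction:         Tabulated
 * water optimization:      No
 * Calculate forces:        yes
 */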
void nb_kernel430_sse2_single(int *           p_nri,
                              int *           iinr,
                              int *           jindex,
                              int *           jjnr,
                              int *           shift,
                              float *         shiftvec,
                              float *         fshift,
                              int *           gid,
                              float *         pos,
                              float *         faction,
                              float *         charge,
                              float *         p_facel,
                              float *         p_krf,
                              float *         p_crf,
                              float *         Vc,
                              int *           type,
                              int *           p_ntype,
                              float *         vdwparam,
                              float *         Vvdw,
                              float *         p_tabscale,
                              float *         VFtab,
                              float *         invsqrta,
                              float *         dvda,
                              float *         p_gbtabscale,
                              float *         GBtab,
                              int *           p_nthreads,
                              int *           count,
                              void *          mtx,
                              int *           outeriter,
                              int *           inneriter,
                              float *         work)
{
    int           nri,ntype,nthreads;
    float         facel,krf,crf,tabscale,gbtabscale;
    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
    float         shX,shY,shZ;
    int           offset,nti;
    int           jnr,jnr2,jnr3,jnr4,j3,j23,j33,j43;
    int           tj,tj2,tj3,tj4;
    gmx_gbdata_t *gbdata;
    float *       gpol;

    __m128   iq,qq,q,isai;
    __m128   ix,iy,iz;
    __m128   jx,jy,jz;
    __m128   xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
    __m128   dx1,dy1,dz1;
    __m128   vctot,Vvdwtot,dvdasum;
    __m128   fix,fiy,fiz,rsq;
    __m128   t1,t2,t3;
    __m128   rinv,isaj,isaprod;
    __m128   vcoul,fscal,gbscale,c6,c12;
    __m128   rinvsq,dvdaj,r,rt;
    __m128   eps,eps2,Y,F,G,H,Geps,Heps2;
    __m128   Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp;
    __m128   rinvsix,Vvdw6,Vvdw12,Vvdwtmp,vgbtot,n0f;
    __m128   fac_sse,tabscale_sse,gbtabscale_sse;

    __m128i  n0, nnn;
    const __m128 neg    = _mm_set1_ps(-1.0f);
    const __m128 zero   = _mm_set1_ps(0.0f);
    const __m128 half   = _mm_set1_ps(0.5f);
    const __m128 two    = _mm_set1_ps(2.0f);
    const __m128 three  = _mm_set1_ps(3.0f);
    const __m128 six    = _mm_set1_ps(6.0f);
    const __m128 twelve = _mm_set1_ps(12.0f);

    __m128i four        = _mm_set1_epi32(4);
    __m128i maski       = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);
    __m128i mask        = _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff);

    float vct,vdwt,vgbt,dva,isai_f;

    gbdata          = (gmx_gbdata_t *)work;
    gpol            = gbdata->gpol;

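    /*
     * Note that facel is premultiplied by (1/epsilon_r - 1/epsilon_solvent),
     * the usual Generalized-Born dielectric screening factor.
     */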
    nri              = *p_nri;
    ntype            = *p_ntype;
    nthreads         = *p_nthreads;
    facel            = (*p_facel) * ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent));
    krf              = *p_krf;
    crf              = *p_crf;
    tabscale         = *p_tabscale;
    gbtabscale       = *p_gbtabscale;
    nj1              = 0;

    /* Splat variables */
    fac_sse        = _mm_load1_ps(&facel);
    tabscale_sse   = _mm_load1_ps(&tabscale);
    gbtabscale_sse = _mm_load1_ps(&gbtabscale);

    /* Keep the compiler happy */
    Vvdwtmp = _mm_setzero_ps();
    dvdatmp = _mm_setzero_ps();
    vcoul   = _mm_setzero_ps();
    vgb     = _mm_setzero_ps();
    t1      = _mm_setzero_ps();
    t2      = _mm_setzero_ps();
    t3      = _mm_setzero_ps();
    xmm1    = _mm_setzero_ps();
    xmm2    = _mm_setzero_ps();
    xmm3    = _mm_setzero_ps();
    xmm4    = _mm_setzero_ps();
    j23     = j33  = 0;
    jnr2    = jnr3 = 0;

    for(n=0; (n<nri); n++)
    {
        is3              = 3*shift[n];
        shX              = shiftvec[is3];
        shY              = shiftvec[is3+1];
        shZ              = shiftvec[is3+2];
        nj0              = jindex[n];
        nj1              = jindex[n+1];
        ii               = iinr[n];
        ii3              = 3*ii;

        ix               = _mm_set1_ps(shX+pos[ii3+0]);
        iy               = _mm_set1_ps(shY+pos[ii3+1]);
        iz               = _mm_set1_ps(shZ+pos[ii3+2]);

        iq               = _mm_load1_ps(charge+ii);
        iq               = _mm_mul_ps(iq,fac_sse);

        isai_f           = invsqrta[ii];
        isai             = _mm_load1_ps(&isai_f);

        nti              = 2*ntype*type[ii];

        vctot            = _mm_setzero_ps();
        Vvdwtot          = _mm_setzero_ps();
        dvdasum          = _mm_setzero_ps();
        vgbtot           = _mm_setzero_ps();
        fix              = _mm_setzero_ps();
        fiy              = _mm_setzero_ps();
        fiz              = _mm_setzero_ps();

        offset           = (nj1-nj0)%4;

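        /*
         * Main inner loop, unrolled four ways: each iteration processes four
         * j particles in the four SSE lanes. The (nj1-nj0)%4 leftover
         * particles are handled by the masked epilogue after this loop.
         */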
        for(k=nj0;k<nj1-offset;k+=4)
        {
            jnr     = jjnr[k];
            jnr2    = jjnr[k+1];
            jnr3    = jjnr[k+2];
            jnr4    = jjnr[k+3];

            j3      = 3*jnr;
            j23     = 3*jnr2;
            j33     = 3*jnr3;
            j43     = 3*jnr4;

            xmm1    = _mm_loadh_pi(xmm1, (__m64 *) (pos+j3));  /* x1 y1 - - */
            xmm2    = _mm_loadh_pi(xmm2, (__m64 *) (pos+j23)); /* x2 y2 - - */
            xmm3    = _mm_loadh_pi(xmm3, (__m64 *) (pos+j33)); /* x3 y3 - - */
            xmm4    = _mm_loadh_pi(xmm4, (__m64 *) (pos+j43)); /* x4 y4 - - */

            xmm5    = _mm_load1_ps(pos+j3+2);
            xmm6    = _mm_load1_ps(pos+j23+2);
            xmm7    = _mm_load1_ps(pos+j33+2);
            xmm8    = _mm_load1_ps(pos+j43+2);

            /* transpose */
            xmm5    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
            xmm6    = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
            jz      = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));

            xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
            xmm2    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
            jx      = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
            jy      = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));

            dx1     = _mm_sub_ps(ix,jx);
            dy1     = _mm_sub_ps(iy,jy);
            dz1     = _mm_sub_ps(iz,jz);

            t1      = _mm_mul_ps(dx1,dx1);
            t2      = _mm_mul_ps(dy1,dy1);
            t3      = _mm_mul_ps(dz1,dz1);

            rsq     = _mm_add_ps(t1,t2);
            rsq     = _mm_add_ps(rsq,t3);
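            /*
             * gmx_mm_invsqrt_ps() is the "software invsqrt" from the file
             * header: an rsqrtps estimate refined with a Newton-Raphson step
             * (see gmx_sse2_single.h).
             */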
            rinv    = gmx_mm_invsqrt_ps(rsq);

            xmm1    = _mm_load_ss(invsqrta+jnr);
            xmm2    = _mm_load_ss(invsqrta+jnr2);
            xmm3    = _mm_load_ss(invsqrta+jnr3);
            xmm4    = _mm_load_ss(invsqrta+jnr4);

            xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
            xmm3    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
            isaj    = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));

            isaprod = _mm_mul_ps(isai,isaj);

            xmm1    = _mm_load_ss(charge+jnr);
            xmm2    = _mm_load_ss(charge+jnr2);
            xmm3    = _mm_load_ss(charge+jnr3);
            xmm4    = _mm_load_ss(charge+jnr4);

            xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
            xmm3    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
            q       = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
            qq      = _mm_mul_ps(iq,q);

            vcoul   = _mm_mul_ps(qq,rinv);
            fscal   = _mm_mul_ps(vcoul,rinv);
            qq      = _mm_mul_ps(qq,neg);
            qq      = _mm_mul_ps(isaprod,qq);
            gbscale = _mm_mul_ps(isaprod,gbtabscale_sse);

            tj      = nti+2*type[jnr];
            tj2     = nti+2*type[jnr2];
            tj3     = nti+2*type[jnr3];
            tj4     = nti+2*type[jnr4];

            xmm1    = _mm_loadl_pi(xmm1,(__m64 *) (vdwparam+tj));
            xmm1    = _mm_loadh_pi(xmm1,(__m64 *) (vdwparam+tj2));
            xmm2    = _mm_loadl_pi(xmm2,(__m64 *) (vdwparam+tj3));
            xmm2    = _mm_loadh_pi(xmm2,(__m64 *) (vdwparam+tj4));

            xmm3    = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,2,0,2)); /* c61 c62 c61 c62 */
            xmm4    = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(0,2,0,2)); /* c63 c64 c63 c64 */
            c6      = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,1,0,1)); /* c61 c62 c63 c64 */

            xmm3    = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1)); /* c121 c122 c121 c122 */
            xmm4    = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,3,1)); /* c123 c124 c123 c124 */
            c12     = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(1,0,1,0)); /* c121 c122 c123 c124 */

            xmm1    = _mm_load_ss(dvda+jnr);
            xmm2    = _mm_load_ss(dvda+jnr2);
            xmm3    = _mm_load_ss(dvda+jnr3);
            xmm4    = _mm_load_ss(dvda+jnr4);

            xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /* j1 j1 j2 j2 */
            xmm3    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /* j3 j3 j4 j4 */
            dvdaj   = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));

            /* Calculate GB table index */
            r       = _mm_mul_ps(rsq,rinv);
            rt      = _mm_mul_ps(r,gbscale);

            n0      = _mm_cvttps_epi32(rt);
            n0f     = _mm_cvtepi32_ps(n0);
            eps     = _mm_sub_ps(rt,n0f);

            eps2    = _mm_mul_ps(eps,eps);
            nnn     = _mm_slli_epi32(n0,2);

            /* the tables are 16-byte aligned, so we can use _mm_load_ps */
            xmm1    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
            xmm2    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
            xmm3    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
            xmm4    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */

            /* transpose 4*4 */
            xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
            xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
            xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
            xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

            Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
            F       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
            G       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
            H       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */

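            /*
             * Standard Gromacs cubic-spline table evaluation:
             *   VV = Y + eps*(F + G*eps + H*eps^2)   (potential)
             *   FF = F + 2*G*eps + 3*H*eps^2         (d(VV)/d(eps))
             */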
            Geps    = _mm_mul_ps(G,eps);
            Heps2   = _mm_mul_ps(H,eps2);
            Fp      = _mm_add_ps(F,Geps);
            Fp      = _mm_add_ps(Fp,Heps2);

            VV      = _mm_mul_ps(Fp,eps);
            VV      = _mm_add_ps(Y,VV);

            xmm1    = _mm_mul_ps(two,Heps2);
            FF      = _mm_add_ps(Fp,Geps);
            FF      = _mm_add_ps(FF,xmm1);

            vgb     = _mm_mul_ps(qq,VV);
            fijC    = _mm_mul_ps(qq,FF);
            fijC    = _mm_mul_ps(fijC,gbscale);

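            /*
             * GB chain-rule term: dvdatmp = -0.5*(vgb + fijC*r) is this
             * pair's contribution to dV/da, accumulated for both the i and
             * j Born radii.
             */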
            dvdatmp = _mm_mul_ps(fijC,r);
            dvdatmp = _mm_add_ps(vgb,dvdatmp);
            dvdatmp = _mm_mul_ps(half,dvdatmp);
            dvdatmp = _mm_mul_ps(neg,dvdatmp);

            dvdasum = _mm_add_ps(dvdasum,dvdatmp);

            xmm1    = _mm_mul_ps(dvdatmp,isaj);
            xmm1    = _mm_mul_ps(xmm1,isaj);
            dvdaj   = _mm_add_ps(dvdaj,xmm1);

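            /* Scatter the four updated dvda values back, rotating one lane
             * into position 0 before each scalar store. */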
            _mm_store_ss(dvda+jnr,dvdaj);
            xmm1    = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1));
            _mm_store_ss(dvda+jnr2,xmm1);
            xmm1    = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(1,0,3,2));
            _mm_store_ss(dvda+jnr3,xmm1);
            xmm1    = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(2,1,0,3));
            _mm_store_ss(dvda+jnr4,xmm1);

            vctot   = _mm_add_ps(vctot,vcoul);
            vgbtot  = _mm_add_ps(vgbtot,vgb);

            /* Calculate VDW table index */
            rt      = _mm_mul_ps(r,tabscale_sse);
            n0      = _mm_cvttps_epi32(rt);
            n0f     = _mm_cvtepi32_ps(n0);
            eps     = _mm_sub_ps(rt,n0f);
            eps2    = _mm_mul_ps(eps,eps);
            nnn     = _mm_slli_epi32(n0,3);
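            /*
             * The VdW table stores two quadruplets per point (dispersion
             * followed by repulsion, 8 floats), hence the shift by 3 above
             * and the +4 offset for the repulsion lookup below.
             */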

            /* Tabulated VdW interaction - dispersion */
            xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
            xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
            xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
            xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */

            /* transpose 4*4 */
            xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
            xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
            xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
            xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

            Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
            F       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
            G       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
            H       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */

            Geps    = _mm_mul_ps(G,eps);
            Heps2   = _mm_mul_ps(H,eps2);
            Fp      = _mm_add_ps(F,Geps);
            Fp      = _mm_add_ps(Fp,Heps2);
            VV      = _mm_mul_ps(Fp,eps);
            VV      = _mm_add_ps(Y,VV);
            xmm1    = _mm_mul_ps(two,Heps2);
            FF      = _mm_add_ps(Fp,Geps);
            FF      = _mm_add_ps(FF,xmm1);

            Vvdw6   = _mm_mul_ps(c6,VV);
            fijD    = _mm_mul_ps(c6,FF);

            /* Tabulated VdW interaction - repulsion */
            nnn     = _mm_add_epi32(nnn,four);

            xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
            xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
            xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
            xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */

            /* transpose 4*4 */
            xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
            xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
            xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
            xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

            Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
            F       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
            G       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
            H       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */

            Geps    = _mm_mul_ps(G,eps);
            Heps2   = _mm_mul_ps(H,eps2);
            Fp      = _mm_add_ps(F,Geps);
            Fp      = _mm_add_ps(Fp,Heps2);
            VV      = _mm_mul_ps(Fp,eps);
            VV      = _mm_add_ps(Y,VV);
            xmm1    = _mm_mul_ps(two,Heps2);
            FF      = _mm_add_ps(Fp,Geps);
            FF      = _mm_add_ps(FF,xmm1);

            Vvdw12  = _mm_mul_ps(c12,VV);
            fijR    = _mm_mul_ps(c12,FF);

            Vvdwtmp = _mm_add_ps(Vvdw12,Vvdw6);
            Vvdwtot = _mm_add_ps(Vvdwtot,Vvdwtmp);

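            /*
             * Combine the force components into one scalar prefactor:
             * fscal = (vcoul*rinv - fijC - (fijD+fijR)*tabscale) * rinv.
             * fijC already carries its gbscale factor from above.
             */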
            xmm1    = _mm_add_ps(fijD,fijR);
            xmm1    = _mm_mul_ps(xmm1,tabscale_sse);
            xmm1    = _mm_add_ps(xmm1,fijC);
            xmm1    = _mm_sub_ps(xmm1,fscal);
            fscal   = _mm_mul_ps(xmm1,neg);
            fscal   = _mm_mul_ps(fscal,rinv);

            t1      = _mm_mul_ps(fscal,dx1);
            t2      = _mm_mul_ps(fscal,dy1);
            t3      = _mm_mul_ps(fscal,dz1);

            fix     = _mm_add_ps(fix,t1);
            fiy     = _mm_add_ps(fiy,t2);
            fiz     = _mm_add_ps(fiz,t3);

            xmm1    = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3));  /* fx1 fy1 - - */
            xmm2    = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23)); /* fx2 fy2 - - */
            xmm3    = _mm_loadh_pi(xmm3, (__m64 *) (faction+j33)); /* fx3 fy3 - - */
            xmm4    = _mm_loadh_pi(xmm4, (__m64 *) (faction+j43)); /* fx4 fy4 - - */

            xmm5    = _mm_load1_ps(faction+j3+2);
            xmm6    = _mm_load1_ps(faction+j23+2);
            xmm7    = _mm_load1_ps(faction+j33+2);
            xmm8    = _mm_load1_ps(faction+j43+2);

            /* transpose the forces */
            xmm5    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
            xmm6    = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
            xmm7    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));

            xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
            xmm2    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));

            xmm5    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
            xmm6    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));

            /* subtract partial terms */
            xmm5    = _mm_sub_ps(xmm5,t1);
            xmm6    = _mm_sub_ps(xmm6,t2);
            xmm7    = _mm_sub_ps(xmm7,t3);

            xmm1    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
            xmm2    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2));
            xmm1    = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0));
            xmm2    = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0));

            /* store fx's and fy's */
            _mm_storel_pi( (__m64 *) (faction+j3),xmm1);
            _mm_storeh_pi( (__m64 *) (faction+j23),xmm1);
            _mm_storel_pi( (__m64 *) (faction+j33),xmm2);
            _mm_storeh_pi( (__m64 *) (faction+j43),xmm2);

            /* ..then z */
            _mm_store_ss(faction+j3+2,xmm7);
            xmm7    = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
            _mm_store_ss(faction+j23+2,xmm7);
            xmm7    = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
            _mm_store_ss(faction+j33+2,xmm7);
            xmm7    = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
            _mm_store_ss(faction+j43+2,xmm7);
        }

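        /*
         * Epilogue: process the 1-3 leftover j particles. Loads are gathered
         * per lane, and a lane mask zeroes the unused slots so they add
         * nothing to the accumulators.
         */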
        if(offset!=0)
        {
            if(offset==1)
            {
                jnr   = jjnr[k];
                j3    = jnr*3;
                tj    = nti+2*type[jnr];

                xmm1  = _mm_loadl_pi(xmm1, (__m64 *) (pos+j3));
                xmm5  = _mm_load1_ps(pos+j3+2);

                xmm6  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
                xmm4  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));

                isaj  = _mm_load_ss(invsqrta+jnr);
                dvdaj = _mm_load_ss(dvda+jnr);
                q     = _mm_load_ss(charge+jnr);
                c6    = _mm_load_ss(vdwparam+tj);
                c12   = _mm_load_ss(vdwparam+tj+1);

                mask  = _mm_set_epi32(0,0,0,0xffffffff);
            }
            else if(offset==2)
            {
                jnr   = jjnr[k];
                jnr2  = jjnr[k+1];

                j3    = jnr*3;
                j23   = jnr2*3;

                tj    = nti+2*type[jnr];
                tj2   = nti+2*type[jnr2];

                xmm1  = _mm_loadh_pi(xmm1, (__m64 *) (pos+j3));
                xmm2  = _mm_loadh_pi(xmm2, (__m64 *) (pos+j23));

                xmm5  = _mm_load1_ps(pos+j3+2);
                xmm6  = _mm_load1_ps(pos+j23+2);

                xmm5  = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
                xmm5  = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));

                xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
                xmm6  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
                xmm4  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));

                xmm1  = _mm_load_ss(invsqrta+jnr);
                xmm2  = _mm_load_ss(invsqrta+jnr2);
                xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
                isaj  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));

                xmm1  = _mm_load_ss(dvda+jnr);
                xmm2  = _mm_load_ss(dvda+jnr2);
                xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
                dvdaj = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));

                xmm1  = _mm_load_ss(charge+jnr);
                xmm2  = _mm_load_ss(charge+jnr2);
                xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
                q     = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));

                xmm1  = _mm_loadl_pi(xmm1,(__m64 *) (vdwparam+tj));
                xmm1  = _mm_loadh_pi(xmm1,(__m64 *) (vdwparam+tj2));

                c6    = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
                c12   = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));

                mask  = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
            }
            else
            {
                jnr   = jjnr[k];
                jnr2  = jjnr[k+1];
                jnr3  = jjnr[k+2];

                j3    = jnr*3;
                j23   = jnr2*3;
                j33   = jnr3*3;

                tj    = nti+2*type[jnr];
                tj2   = nti+2*type[jnr2];
                tj3   = nti+2*type[jnr3];

                xmm1  = _mm_loadh_pi(xmm1,(__m64 *) (pos+j3));
                xmm2  = _mm_loadh_pi(xmm2,(__m64 *) (pos+j23));
                xmm3  = _mm_loadh_pi(xmm3,(__m64 *) (pos+j33));

                xmm5  = _mm_load1_ps(pos+j3+2);
                xmm6  = _mm_load1_ps(pos+j23+2);
                xmm7  = _mm_load1_ps(pos+j33+2);

                xmm5  = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
                xmm5  = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));

                xmm1  = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2));
                xmm2  = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2));

                xmm6  = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0));
                xmm4  = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1));

                xmm1  = _mm_load_ss(invsqrta+jnr);
                xmm2  = _mm_load_ss(invsqrta+jnr2);
                xmm3  = _mm_load_ss(invsqrta+jnr3);

                xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
                xmm3  = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0));
                isaj  = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));

                xmm1  = _mm_load_ss(dvda+jnr);
                xmm2  = _mm_load_ss(dvda+jnr2);
                xmm3  = _mm_load_ss(dvda+jnr3);

                xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
                xmm3  = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0));
                dvdaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));

                xmm1  = _mm_load_ss(charge+jnr);
                xmm2  = _mm_load_ss(charge+jnr2);
                xmm3  = _mm_load_ss(charge+jnr3);

                xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
                xmm3  = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0));
                q     = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));

                xmm1  = _mm_loadl_pi(xmm1, (__m64 *) (vdwparam+tj));
                xmm1  = _mm_loadh_pi(xmm1, (__m64 *) (vdwparam+tj2));
                xmm2  = _mm_loadl_pi(xmm2, (__m64 *) (vdwparam+tj3));

                xmm3  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,2,0,2));
                c6    = _mm_shuffle_ps(xmm3,xmm2,_MM_SHUFFLE(1,0,0,1));

                xmm3  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
                c12   = _mm_shuffle_ps(xmm3,xmm2,_MM_SHUFFLE(1,1,1,0));

                mask  = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
            }

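            /* Zero the unused lanes so they drop out of all sums below */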
            jx      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm6);
            jy      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm4);
            jz      = _mm_and_ps( gmx_mm_castsi128_ps(mask), xmm5);

            c6      = _mm_and_ps( gmx_mm_castsi128_ps(mask), c6);
            c12     = _mm_and_ps( gmx_mm_castsi128_ps(mask), c12);
            dvdaj   = _mm_and_ps( gmx_mm_castsi128_ps(mask), dvdaj);
            isaj    = _mm_and_ps( gmx_mm_castsi128_ps(mask), isaj);
            q       = _mm_and_ps( gmx_mm_castsi128_ps(mask), q);

            dx1     = _mm_sub_ps(ix,jx);
            dy1     = _mm_sub_ps(iy,jy);
            dz1     = _mm_sub_ps(iz,jz);

            t1      = _mm_mul_ps(dx1,dx1);
            t2      = _mm_mul_ps(dy1,dy1);
            t3      = _mm_mul_ps(dz1,dz1);

            rsq     = _mm_add_ps(t1,t2);
            rsq     = _mm_add_ps(rsq,t3);

            rinv    = gmx_mm_invsqrt_ps(rsq);

            isaprod = _mm_mul_ps(isai,isaj);
            qq      = _mm_mul_ps(iq,q);
            vcoul   = _mm_mul_ps(qq,rinv);
            fscal   = _mm_mul_ps(vcoul,rinv);

            qq      = _mm_mul_ps(qq,neg);
            qq      = _mm_mul_ps(isaprod,qq);

            gbscale = _mm_mul_ps(isaprod,gbtabscale_sse);
            rinvsq  = _mm_mul_ps(rinv,rinv);
            r       = _mm_mul_ps(rsq,rinv);
            rt      = _mm_mul_ps(r,gbscale);

            n0      = _mm_cvttps_epi32(rt);
            n0f     = _mm_cvtepi32_ps(n0);
            eps     = _mm_sub_ps(rt,n0f);

            eps2    = _mm_mul_ps(eps,eps);
            nnn     = _mm_slli_epi32(n0,2);

            /* the tables are 16-byte aligned, so we can use _mm_load_ps */
            xmm1    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
            xmm2    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
            xmm3    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
            xmm4    = _mm_load_ps(GBtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */

            /* transpose 4*4 */
            xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
            xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
            xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
            xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

            Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
            F       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2));
            G       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0));
            H       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2));

            Geps    = _mm_mul_ps(G,eps);
            Heps2   = _mm_mul_ps(H,eps2);
            Fp      = _mm_add_ps(F,Geps);
            Fp      = _mm_add_ps(Fp,Heps2);
            VV      = _mm_mul_ps(Fp,eps);
            VV      = _mm_add_ps(Y,VV);

            xmm1    = _mm_mul_ps(two,Heps2);
            FF      = _mm_add_ps(Fp,Geps);
            FF      = _mm_add_ps(FF,xmm1);
            vgb     = _mm_mul_ps(qq,VV);
            fijC    = _mm_mul_ps(qq,FF);
            fijC    = _mm_mul_ps(fijC,gbscale);

            dvdatmp = _mm_mul_ps(fijC,r);
            dvdatmp = _mm_add_ps(vgb,dvdatmp);
            dvdatmp = _mm_mul_ps(half,dvdatmp);
            dvdatmp = _mm_mul_ps(neg,dvdatmp);

            dvdasum = _mm_add_ps(dvdasum,dvdatmp);

            xmm1    = _mm_mul_ps(dvdatmp,isaj);
            xmm1    = _mm_mul_ps(xmm1,isaj);
            dvdaj   = _mm_add_ps(dvdaj,xmm1);

            vcoul   = _mm_and_ps( gmx_mm_castsi128_ps(mask), vcoul);
            vgb     = _mm_and_ps( gmx_mm_castsi128_ps(mask), vgb);

            vctot   = _mm_add_ps(vctot,vcoul);
            vgbtot  = _mm_add_ps(vgbtot,vgb);

            /* Calculate VDW table index */
            rt      = _mm_mul_ps(r,tabscale_sse);
            n0      = _mm_cvttps_epi32(rt);
            n0f     = _mm_cvtepi32_ps(n0);
            eps     = _mm_sub_ps(rt,n0f);
            eps2    = _mm_mul_ps(eps,eps);
            nnn     = _mm_slli_epi32(n0,3);

            /* Tabulated VdW interaction - dispersion */
            xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
            xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
            xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
            xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */

            /* transpose 4*4 */
            xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
            xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
            xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
            xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

            Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
            F       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
            G       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
            H       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */

            Geps    = _mm_mul_ps(G,eps);
            Heps2   = _mm_mul_ps(H,eps2);
            Fp      = _mm_add_ps(F,Geps);
            Fp      = _mm_add_ps(Fp,Heps2);
            VV      = _mm_mul_ps(Fp,eps);
            VV      = _mm_add_ps(Y,VV);
            xmm1    = _mm_mul_ps(two,Heps2);
            FF      = _mm_add_ps(Fp,Geps);
            FF      = _mm_add_ps(FF,xmm1);

            Vvdw6   = _mm_mul_ps(c6,VV);
            fijD    = _mm_mul_ps(c6,FF);

            /* Tabulated VdW interaction - repulsion */
            nnn     = _mm_add_epi32(nnn,four);

            xmm1    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,0)));  /* Y1,F1,G1,H1 */
            xmm2    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,1)));  /* Y2,F2,G2,H2 */
            xmm3    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,2)));  /* Y3,F3,G3,H3 */
            xmm4    = _mm_load_ps(VFtab+(gmx_mm_extract_epi32(nnn,3)));  /* Y4,F4,G4,H4 */

            /* transpose 4*4 */
            xmm5    = _mm_unpacklo_ps(xmm1,xmm2); /* Y1,Y2,F1,F2 */
            xmm6    = _mm_unpacklo_ps(xmm3,xmm4); /* Y3,Y4,F3,F4 */
            xmm7    = _mm_unpackhi_ps(xmm1,xmm2); /* G1,G2,H1,H2 */
            xmm8    = _mm_unpackhi_ps(xmm3,xmm4); /* G3,G4,H3,H4 */

            Y       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /* Y1 Y2 Y3 Y4 */
            F       = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /* F1 F2 F3 F4 */
            G       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(1,0,1,0)); /* G1 G2 G3 G4 */
            H       = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(3,2,3,2)); /* H1 H2 H3 H4 */

            Geps    = _mm_mul_ps(G,eps);
            Heps2   = _mm_mul_ps(H,eps2);
            Fp      = _mm_add_ps(F,Geps);
            Fp      = _mm_add_ps(Fp,Heps2);
            VV      = _mm_mul_ps(Fp,eps);
            VV      = _mm_add_ps(Y,VV);
            xmm1    = _mm_mul_ps(two,Heps2);
            FF      = _mm_add_ps(Fp,Geps);
            FF      = _mm_add_ps(FF,xmm1);

            Vvdw12  = _mm_mul_ps(c12,VV);
            fijR    = _mm_mul_ps(c12,FF);

            Vvdwtmp = _mm_add_ps(Vvdw12,Vvdw6);
            Vvdwtot = _mm_add_ps(Vvdwtot,Vvdwtmp);

            xmm1    = _mm_add_ps(fijD,fijR);
            xmm1    = _mm_mul_ps(xmm1,tabscale_sse);
            xmm1    = _mm_add_ps(xmm1,fijC);
            xmm1    = _mm_sub_ps(xmm1,fscal);
            fscal   = _mm_mul_ps(xmm1,neg);
            fscal   = _mm_mul_ps(fscal,rinv);

            t1      = _mm_mul_ps(fscal,dx1);
            t2      = _mm_mul_ps(fscal,dy1);
            t3      = _mm_mul_ps(fscal,dz1);

            if(offset==1)
            {
                _mm_store_ss(dvda+jnr,dvdaj);

                xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (faction+j3));
                xmm7 = _mm_load1_ps(faction+j3+2);

                xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
                xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));

                xmm5 = _mm_sub_ps(xmm5,t1);
                xmm6 = _mm_sub_ps(xmm6,t2);
                xmm7 = _mm_sub_ps(xmm7,t3);

                _mm_store_ss(faction+j3 , xmm5);
                _mm_store_ss(faction+j3+1,xmm6);
                _mm_store_ss(faction+j3+2,xmm7);
            }
            else if(offset==2)
            {
                _mm_store_ss(dvda+jnr,dvdaj);
                xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1));
                _mm_store_ss(dvda+jnr2,xmm1);

                xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3));
                xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23));

                xmm5 = _mm_load1_ps(faction+j3+2);
                xmm6 = _mm_load1_ps(faction+j23+2);

                xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
                xmm7 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));

                xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
                xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
                xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));

                xmm5 = _mm_sub_ps(xmm5, t1);
                xmm6 = _mm_sub_ps(xmm6, t2);
                xmm7 = _mm_sub_ps(xmm7, t3);

                xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
                xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0));

                _mm_storel_pi( (__m64 *) (faction+j3), xmm5);
                _mm_storeh_pi( (__m64 *) (faction+j23), xmm5);

                _mm_store_ss(faction+j3+2,xmm7);
                xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
                _mm_store_ss(faction+j23+2,xmm7);
            }
            else
            {
                _mm_store_ss(dvda+jnr,dvdaj);
                xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(0,3,2,1));
                _mm_store_ss(dvda+jnr2,xmm1);
                xmm1 = _mm_shuffle_ps(dvdaj,dvdaj,_MM_SHUFFLE(1,0,3,2));
                _mm_store_ss(dvda+jnr3,xmm1);

                xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (faction+j3));
                xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (faction+j23));
                xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (faction+j33));

                xmm5 = _mm_load1_ps(faction+j3+2);
                xmm6 = _mm_load1_ps(faction+j23+2);
                xmm7 = _mm_load1_ps(faction+j33+2);

                xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
                xmm6 = _mm_shuffle_ps(xmm7,xmm7, _MM_SHUFFLE(0,0,0,0));
                xmm7 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(2,0,2,0));

                xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
                xmm2 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(3,2,3,2));

                xmm5 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
                xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));

                xmm5 = _mm_sub_ps(xmm5, t1);
                xmm6 = _mm_sub_ps(xmm6, t2);
                xmm7 = _mm_sub_ps(xmm7, t3);

                xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0));
                xmm2 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2));
                xmm1 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0));
                xmm2 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0));

                _mm_storel_pi( (__m64 *) (faction+j3), xmm1);
                _mm_storeh_pi( (__m64 *) (faction+j23), xmm1);
                _mm_storel_pi( (__m64 *) (faction+j33), xmm2);

                _mm_store_ss(faction+j3+2,xmm7);
                xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
                _mm_store_ss(faction+j23+2,xmm7);
                xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
                _mm_store_ss(faction+j33+2,xmm7);
            }

            t1 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t1);
            t2 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t2);
            t3 = _mm_and_ps( gmx_mm_castsi128_ps(mask), t3);

            fix = _mm_add_ps(fix,t1);
            fiy = _mm_add_ps(fiy,t2);
            fiz = _mm_add_ps(fiz,t3);
        }

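        /* Reduce the four lanes of fix/fiy/fiz to a single scalar sum:
         * movehl adds the high pair onto the low pair, and the (1,1,1,1)
         * shuffle then adds element 1 onto element 0. */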
        t1      = _mm_movehl_ps(t1,fix);
        t2      = _mm_movehl_ps(t2,fiy);
        t3      = _mm_movehl_ps(t3,fiz);

        fix     = _mm_add_ps(fix,t1);
        fiy     = _mm_add_ps(fiy,t2);
        fiz     = _mm_add_ps(fiz,t3);

        t1      = _mm_shuffle_ps(fix,fix,_MM_SHUFFLE(1,1,1,1));
        t2      = _mm_shuffle_ps(fiy,fiy,_MM_SHUFFLE(1,1,1,1));
        t3      = _mm_shuffle_ps(fiz,fiz,_MM_SHUFFLE(1,1,1,1));

        fix     = _mm_add_ps(fix,t1);
        fiy     = _mm_add_ps(fiy,t2);
        fiz     = _mm_add_ps(fiz,t3);

        xmm2    = _mm_unpacklo_ps(fix,fiy); /* fx, fy, - - */
        xmm2    = _mm_movelh_ps(xmm2,fiz);
        xmm2    = _mm_and_ps( gmx_mm_castsi128_ps(maski), xmm2);

        /* load i force from memory */
        xmm4    = _mm_loadl_pi(xmm4, (__m64 *) (faction+ii3));
        xmm5    = _mm_load1_ps(faction+ii3+2);
        xmm4    = _mm_shuffle_ps(xmm4,xmm5,_MM_SHUFFLE(3,2,1,0));

        /* add to i force */
        xmm4    = _mm_add_ps(xmm4,xmm2);

        /* store i force to memory */
        _mm_storel_pi( (__m64 *) (faction+ii3),xmm4);
        xmm4    = _mm_shuffle_ps(xmm4,xmm4,_MM_SHUFFLE(2,2,2,2));
        _mm_store_ss(faction+ii3+2,xmm4);

        /* Load, add and store i shift forces */
        xmm4 = _mm_loadl_pi(xmm4, (__m64 *) (fshift+is3));
        xmm5 = _mm_load1_ps(fshift+is3+2);
        xmm4 = _mm_shuffle_ps(xmm4,xmm5,_MM_SHUFFLE(3,2,1,0));

        xmm4 = _mm_add_ps(xmm4,xmm2);

        _mm_storel_pi( (__m64 *) (fshift+is3),xmm4);
        xmm4 = _mm_shuffle_ps(xmm4,xmm4,_MM_SHUFFLE(2,2,2,2));
        _mm_store_ss(fshift+is3+2,xmm4);

        /* Coulomb potential */
        ggid             = gid[n];

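        /* Horizontal sums of the potential accumulators, added into the
         * per-group energy arrays */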
936                 vcoul   = _mm_movehl_ps(vcoul,vctot);
937                 vctot   = _mm_add_ps(vctot,vcoul);
938                 vcoul   = _mm_shuffle_ps(vctot,vctot,_MM_SHUFFLE(1,1,1,1));
939                 vctot   = _mm_add_ss(vctot,vcoul);
940                 
941                 _mm_store_ss(&vct,vctot);
942                 Vc[ggid] = Vc[ggid] + vct;
943                 
944                 Vvdwtmp  = _mm_movehl_ps(Vvdwtmp,Vvdwtot);
945                 Vvdwtot  = _mm_add_ps(Vvdwtot,Vvdwtmp);
946                 Vvdwtmp  = _mm_shuffle_ps(Vvdwtot,Vvdwtot,_MM_SHUFFLE(1,1,1,1));
947                 Vvdwtot  = _mm_add_ss(Vvdwtot,Vvdwtmp);
948                 
949                 _mm_store_ss(&vdwt,Vvdwtot);
950                 Vvdw[ggid] = Vvdw[ggid] + vdwt;
951                 
        /* dvda: reduce the Born-radii derivative contributions */
        dvdatmp = _mm_movehl_ps(dvdatmp,dvdasum);
        dvdasum = _mm_add_ps(dvdasum,dvdatmp);
        dvdatmp = _mm_shuffle_ps(dvdasum,dvdasum,_MM_SHUFFLE(1,1,1,1));
        dvdasum = _mm_add_ss(dvdasum,dvdatmp);

        _mm_store_ss(&dva,dvdasum);
        dvda[ii] = dvda[ii] + dva*isai_f*isai_f;

        /* store the GB potential to the work array */
        vgb     = _mm_movehl_ps(vgb,vgbtot);
        vgbtot  = _mm_add_ps(vgbtot,vgb);
        vgb     = _mm_shuffle_ps(vgbtot,vgbtot,_MM_SHUFFLE(1,1,1,1));
        vgbtot  = _mm_add_ss(vgbtot,vgb);

        _mm_store_ss(&vgbt,vgbtot);
        gpol[ggid] = gpol[ggid] + vgbt;
    }

    *outeriter       = nri;
    *inneriter       = nj1;
}
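
/*
 * Illustrative sketch (not emitted by mknb): the i-force and potential
 * reductions above all use the same SSE horizontal-sum pattern, which the
 * generated code keeps inlined.  A hypothetical helper such as
 * nb430_hsum_ps() below shows the pattern in isolation: fold the upper
 * half of the register onto the lower half, then lane 1 onto lane 0.
 */
static float
nb430_hsum_ps(__m128 x)
{
    __m128 t;

    t = _mm_movehl_ps(x,x);                        /* x2, x3, x2, x3     */
    x = _mm_add_ps(x,t);                           /* x0+x2, x1+x3, -, - */
    t = _mm_shuffle_ps(x,x,_MM_SHUFFLE(1,1,1,1));  /* broadcast lane 1   */
    x = _mm_add_ss(x,t);                           /* total in lane 0    */

    return _mm_cvtss_f32(x);
}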


/*
 * Gromacs nonbonded kernel nb_kernel430nf
 * Coulomb interaction:     Generalized-Born
 * VdW interaction:         Tabulated
 * water optimization:      No
 * Calculate forces:        no
 */
void nb_kernel430nf_sse2_single(int *           p_nri,
                                int *           iinr,
                                int *           jindex,
                                int *           jjnr,
                                int *           shift,
                                float *         shiftvec,
                                float *         fshift,
                                int *           gid,
                                float *         pos,
                                float *         faction,
                                float *         charge,
                                float *         p_facel,
                                float *         p_krf,
                                float *         p_crf,
                                float *         Vc,
                                int *           type,
                                int *           p_ntype,
                                float *         vdwparam,
                                float *         Vvdw,
                                float *         p_tabscale,
                                float *         VFtab,
                                float *         invsqrta,
                                float *         dvda,
                                float *         p_gbtabscale,
                                float *         GBtab,
                                int *           p_nthreads,
                                int *           count,
                                void *          mtx,
                                int *           outeriter,
                                int *           inneriter,
                                float *         work)
{
    int           nri,ntype,nthreads;
    float         facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
    int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
    float         shX,shY,shZ;
    float         iq;
    float         qq,vcoul,vctot;
    int           nti;
    int           tj;
    float         Vvdw6,Vvdwtot;
    float         Vvdw12;
    float         r,rt,eps,eps2;
    int           n0,nnn;
    float         Y,F,Geps,Heps2,Fp,VV;
    float         isai,isaj,isaprod,gbscale;
    float         ix1,iy1,iz1;
    float         jx1,jy1,jz1;
    float         dx11,dy11,dz11,rsq11,rinv11;
    float         c6,c12;

    nri              = *p_nri;
    ntype            = *p_ntype;
    nthreads         = *p_nthreads;
    facel            = *p_facel;
    krf              = *p_krf;
    crf              = *p_crf;
    tabscale         = *p_tabscale;
    gbtabscale       = *p_gbtabscale;
    nj1              = 0;

    for(n=0; (n<nri); n++)
    {
        is3              = 3*shift[n];
        shX              = shiftvec[is3];
        shY              = shiftvec[is3+1];
        shZ              = shiftvec[is3+2];
        nj0              = jindex[n];
        nj1              = jindex[n+1];
        ii               = iinr[n];
        ii3              = 3*ii;
        ix1              = shX + pos[ii3+0];
        iy1              = shY + pos[ii3+1];
        iz1              = shZ + pos[ii3+2];
        iq               = facel*charge[ii];
        isai             = invsqrta[ii];
        nti              = 2*ntype*type[ii];
        vctot            = 0;
        Vvdwtot          = 0;

        for(k=nj0; (k<nj1); k++)
        {
            jnr              = jjnr[k];
            j3               = 3*jnr;
            jx1              = pos[j3+0];
            jy1              = pos[j3+1];
            jz1              = pos[j3+2];
            dx11             = ix1 - jx1;
            dy11             = iy1 - jy1;
            dz11             = iz1 - jz1;
            rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
            rinv11           = gmx_invsqrt(rsq11);
            isaj             = invsqrta[jnr];
            isaprod          = isai*isaj;
            qq               = iq*charge[jnr];
            vcoul            = qq*rinv11;
            qq               = isaprod*(-qq);
            gbscale          = isaprod*gbtabscale;
            tj               = nti+2*type[jnr];
            c6               = vdwparam[tj];
            c12              = vdwparam[tj+1];
            /* Generalized-Born table lookup */
            r                = rsq11*rinv11;
            rt               = r*gbscale;
            n0               = rt;
            eps              = rt-n0;
            eps2             = eps*eps;
            nnn              = 4*n0;
            Y                = GBtab[nnn];
            F                = GBtab[nnn+1];
            Geps             = eps*GBtab[nnn+2];
            Heps2            = eps2*GBtab[nnn+3];
            Fp               = F+Geps+Heps2;
            VV               = Y+eps*Fp;
            vgb              = qq*VV;
            vctot            = vctot + vcoul;
            /* dispersion table lookup */
            r                = rsq11*rinv11;
            rt               = r*tabscale;
            n0               = rt;
            eps              = rt-n0;
            eps2             = eps*eps;
            nnn              = 8*n0;
            Y                = VFtab[nnn];
            F                = VFtab[nnn+1];
            Geps             = eps*VFtab[nnn+2];
            Heps2            = eps2*VFtab[nnn+3];
            Fp               = F+Geps+Heps2;
            VV               = Y+eps*Fp;
            Vvdw6            = c6*VV;
            /* repulsion table lookup */
            nnn              = nnn+4;
            Y                = VFtab[nnn];
            F                = VFtab[nnn+1];
            Geps             = eps*VFtab[nnn+2];
            Heps2            = eps2*VFtab[nnn+3];
            Fp               = F+Geps+Heps2;
            VV               = Y+eps*Fp;
            Vvdw12           = c12*VV;
            Vvdwtot          = Vvdwtot + Vvdw6 + Vvdw12;
        }

        ggid             = gid[n];
        Vc[ggid]         = Vc[ggid] + vctot;
        Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
    }

    *outeriter       = nri;
    *inneriter       = nj1;
}
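
/*
 * Illustrative sketch (not emitted by mknb): both kernels evaluate the
 * tabulated potentials with the standard Gromacs cubic-spline table
 * layout of four floats (Y,F,G,H) per table point.  The hypothetical
 * helper below shows the scalar form of that lookup for a single
 * distance r; the kernels keep it inlined, and the VdW table interleaves
 * dispersion and repulsion quadruplets, hence the 8*n0 stride above.
 */
static float
nb430_table_interp(const float * tab, float r, float scale)
{
    float rt,eps,eps2,Y,F,Geps,Heps2,Fp;
    int   n0,nnn;

    rt    = r*scale;            /* table coordinate              */
    n0    = rt;                 /* integer table point           */
    eps   = rt-n0;              /* fractional offset in [0,1)    */
    eps2  = eps*eps;
    nnn   = 4*n0;               /* four floats per table point   */
    Y     = tab[nnn];
    F     = tab[nnn+1];
    Geps  = eps*tab[nnn+2];
    Heps2 = eps2*tab[nnn+3];
    Fp    = F+Geps+Heps2;

    return Y+eps*Fp;            /* interpolated potential VV     */
}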