Merge remote-tracking branch 'gerrit/release-4-6'
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel410_sse2_single.c
1 /*
2  * Copyright (c) Erik Lindahl, David van der Spoel 2003
3  * 
4  * This file is generated automatically at compile time
5  * by the program mknb in the Gromacs distribution.
6  *
7  * Options used when generation this file:
8  * Language:         c
9  * Precision:        single
10  * Threads:          no
11  * Software invsqrt: yes
12  * Prefetch forces:  no
13  * Comments:         no
14  */
15 #ifdef HAVE_CONFIG_H
16 #include<config.h>
17 #endif
18 #include<math.h>
19 #include<vec.h>
20
21 #include <emmintrin.h>
22
23 #include <gmx_sse2_single.h>
24
25 /* get gmx_gbdata_t */
26 #include "../nb_kerneltype.h"
27
28
29 void nb_kernel410_sse2_single(int *           p_nri,
30                     int *           iinr,
31                     int *           jindex,
32                     int *           jjnr,
33                     int *           shift,
34                     float *         shiftvec,
35                     float *         fshift,
36                     int *           gid,
37                     float *         pos,
38                     float *         faction,
39                     float *         charge,
40                     float *         p_facel,
41                     float *         p_krf,
42                     float *         p_crf,
43                     float *         vc,
44                     int *           type,
45                     int *           p_ntype,
46                     float *         vdwparam,
47                     float *         vvdw,
48                     float *         p_tabscale,
49                     float *         VFtab,
50                     float *         invsqrta,
51                     float *         dvda,
52                     float *         p_gbtabscale,
53                     float *         GBtab,
54                     int *           p_nthreads,
55                     int *           count,
56                     void *          mtx,
57                     int *           outeriter,
58                     int *           inneriter,
59                     float *         work)
60 {
61     int           nri,ntype,nthreads;
62     int           n,ii,is3,ii3,k,nj0,nj1,ggid;
63     float         shX,shY,shZ;
64         int                       offset,nti;
65     int           jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
66     int           jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
67         int           tjA,tjB,tjC,tjD;
68         int           tjE,tjF,tjG,tjH;
69         gmx_gbdata_t *gbdata;
70         float *        gpol;
71         
72         __m128   iq,qq,qqB,jq,jqB,isai;
73         __m128   ix,iy,iz;
74         __m128   jx,jy,jz;
75         __m128   jxB,jyB,jzB;
76         __m128   dx,dy,dz;
77         __m128   dxB,dyB,dzB;
78         __m128   vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
79         __m128   fix,fiy,fiz,tx,ty,tz,rsq;
80         __m128   fixB,fiyB,fizB,txB,tyB,tzB,rsqB;
81         __m128   rinv,isaj,isaprod;
82         __m128   rinvB,isajB,isaprodB;
83         __m128   vcoul,vcoulB,fscal,fscalB,gbscale,gbscaleB,c6,c6B,c12,c12B;
84         __m128   rinvsq,r,rtab;
85         __m128   rinvsqB,rB,rtabB;
86         __m128   eps,Y,F,G,H;
87         __m128   epsB,YB,FB,GB,HB;
88         __m128   vgb,vgbB,fijGB,fijGBB,dvdatmp,dvdatmpB;
89         __m128   rinvsix,vvdw6,vvdw12;
90         __m128   rinvsixB,vvdw6B,vvdw12B;
91         __m128   facel,gbtabscale,mask,dvdaj;
92     
93     __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
94         __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
95         __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
96     
97         __m128i  n0, nnn;
98         __m128i  n0B, nnnB;
99         
100         const __m128 neg        = _mm_set1_ps(-1.0f);
101         const __m128 zero       = _mm_set1_ps(0.0f);
102         const __m128 minushalf  = _mm_set1_ps(-0.5f);
103         const __m128 two        = _mm_set1_ps(2.0f);
104         const __m128 six        = _mm_set1_ps(6.0f);
105         const __m128 twelve     = _mm_set1_ps(12.0f);
106
107         gbdata          = (gmx_gbdata_t *)work;
108         gpol            = gbdata->gpol;
109         
110         nri              = *p_nri;         
111     ntype            = *p_ntype;       
112     
113     gbfactor         = _mm_set1_ps( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
114     gbtabscale       = _mm_load1_ps(p_gbtabscale);  
115     facel            = _mm_load1_ps(p_facel);
116
117     nj1              = 0;
118     jnrA = jnrB = jnrC = jnrD = 0;
119     j3A = j3B = j3C = j3D = 0;
120     jx               = _mm_setzero_ps();
121     jy               = _mm_setzero_ps();
122     jz               = _mm_setzero_ps();
123     c6               = _mm_setzero_ps();
124     c12              = _mm_setzero_ps();
125
126     
127     for(n=0; (n<nri); n++)
128     {
129         is3              = 3*shift[n];     
130         shX              = shiftvec[is3];  
131         shY              = shiftvec[is3+1];
132         shZ              = shiftvec[is3+2];
133         nj0              = jindex[n];      
134         nj1              = jindex[n+1];    
135         ii               = iinr[n];        
136         ii3              = 3*ii;           
137         
138                 ix               = _mm_set1_ps(shX+pos[ii3+0]);
139                 iy               = _mm_set1_ps(shY+pos[ii3+1]);
140                 iz               = _mm_set1_ps(shZ+pos[ii3+2]);
141         
142                 iq               = _mm_load1_ps(charge+ii);
143                 iq               = _mm_mul_ps(iq,facel);
144                         
145                 isai             = _mm_load1_ps(invsqrta+ii);
146                                                                 
147                 nti              = 2*ntype*type[ii];
148                 
149                 vctot            = _mm_setzero_ps();
150                 vvdwtot          = _mm_setzero_ps();
151                 vgbtot           = _mm_setzero_ps();
152                 dvdasum          = _mm_setzero_ps();
153                 fix              = _mm_setzero_ps();
154                 fiy              = _mm_setzero_ps();
155                 fiz              = _mm_setzero_ps();
156        
157         for(k=nj0; k<nj1-7; k+=8)
158                 {
159                         jnrA        = jjnr[k];   
160                         jnrB        = jjnr[k+1];
161                         jnrC        = jjnr[k+2];
162                         jnrD        = jjnr[k+3];
163                         jnrE        = jjnr[k+4];   
164                         jnrF        = jjnr[k+5];
165                         jnrG        = jjnr[k+6];
166                         jnrH        = jjnr[k+7];
167             
168             j3A         = 3*jnrA;  
169                         j3B         = 3*jnrB;
170                         j3C         = 3*jnrC;
171                         j3D         = 3*jnrD;
172             j3E         = 3*jnrE;  
173                         j3F         = 3*jnrF;
174                         j3G         = 3*jnrG;
175                         j3H         = 3*jnrH;
176             
177
178             GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
179             GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3E,pos+j3F,pos+j3G,pos+j3H,jxB,jyB,jzB);
180
181                         dx           = _mm_sub_ps(ix,jx);
182                         dy           = _mm_sub_ps(iy,jy);
183                         dz           = _mm_sub_ps(iz,jz);
184                         dxB          = _mm_sub_ps(ix,jxB);
185                         dyB          = _mm_sub_ps(iy,jyB);
186                         dzB          = _mm_sub_ps(iz,jzB);
187                         
188                         rsq          = gmx_mm_calc_rsq_ps(dx,dy,dz);
189                         rsqB         = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
190                         
191                         /* invsqrt */                                                           
192                         rinv         = gmx_mm_invsqrt_ps(rsq);
193                         rinvB        = gmx_mm_invsqrt_ps(rsqB);
194                         rinvsq       = _mm_mul_ps(rinv,rinv);
195                         rinvsqB      = _mm_mul_ps(rinvB,rinvB);
196
197                         /***********************************/
198                         /* INTERACTION SECTION STARTS HERE */
199                         /***********************************/
200                         GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
201                         GMX_MM_LOAD_4VALUES_PS(charge+jnrE,charge+jnrF,charge+jnrG,charge+jnrH,jqB);
202                         GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
203                         GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrE,invsqrta+jnrF,invsqrta+jnrG,invsqrta+jnrH,isajB);
204
205             /* Lennard-Jones */
206             tjA          = nti+2*type[jnrA];
207                         tjB          = nti+2*type[jnrB];
208                         tjC          = nti+2*type[jnrC];
209                         tjD          = nti+2*type[jnrD];
210             tjE          = nti+2*type[jnrE];
211                         tjF          = nti+2*type[jnrF];
212                         tjG          = nti+2*type[jnrG];
213                         tjH          = nti+2*type[jnrH];
214             
215             GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,vdwparam+tjD,c6,c12);
216             GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjE,vdwparam+tjF,vdwparam+tjG,vdwparam+tjH,c6B,c12B);
217                         
218                         isaprod      = _mm_mul_ps(isai,isaj);
219                         isaprodB     = _mm_mul_ps(isai,isajB);
220                         qq           = _mm_mul_ps(iq,jq);            
221                         qqB          = _mm_mul_ps(iq,jqB);            
222                         vcoul        = _mm_mul_ps(qq,rinv);
223                         vcoulB       = _mm_mul_ps(qqB,rinvB);
224                         fscal        = _mm_mul_ps(vcoul,rinv);                                 
225                         fscalB       = _mm_mul_ps(vcoulB,rinvB);                                 
226             vctot        = _mm_add_ps(vctot,vcoul);
227             vctot        = _mm_add_ps(vctot,vcoulB);
228             
229             /* Polarization interaction */
230                         qq           = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
231                         qqB          = _mm_mul_ps(qqB,_mm_mul_ps(isaprodB,gbfactor));
232                         gbscale      = _mm_mul_ps(isaprod,gbtabscale);
233                         gbscaleB     = _mm_mul_ps(isaprodB,gbtabscale);
234
235                         /* Calculate GB table index */
236                         r            = _mm_mul_ps(rsq,rinv);
237                         rB           = _mm_mul_ps(rsqB,rinvB);
238                         rtab         = _mm_mul_ps(r,gbscale);
239                         rtabB        = _mm_mul_ps(rB,gbscaleB);
240                         
241                         n0           = _mm_cvttps_epi32(rtab);
242                         n0B          = _mm_cvttps_epi32(rtabB);
243             eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
244             epsB         = _mm_sub_ps(rtabB , _mm_cvtepi32_ps(n0B) );
245                         nnn          = _mm_slli_epi32(n0,2);
246                         nnnB         = _mm_slli_epi32(n0B,2);
247                         
248             /* the tables are 16-byte aligned, so we can use _mm_load_ps */                     
249             Y            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
250             F            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
251             G            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
252             H            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
253             _MM_TRANSPOSE4_PS(Y,F,G,H);
254             YB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,0)); /* YFGH */
255             FB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,1)); /* YFGH */
256             GB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,2)); /* YFGH */
257             HB           = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnnB,3)); /* YFGH */
258             _MM_TRANSPOSE4_PS(YB,FB,GB,HB);
259             
260             G       = _mm_mul_ps(G,eps);
261             H       = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
262             F       = _mm_add_ps(F, _mm_add_ps( G , H ) );
263             Y       = _mm_add_ps(Y, _mm_mul_ps(F, eps));
264             F       = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
265             vgb     = _mm_mul_ps(Y, qq);           
266             fijGB   = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
267
268             GB      = _mm_mul_ps(GB,epsB);
269             HB      = _mm_mul_ps(HB, _mm_mul_ps(epsB,epsB) );
270             FB      = _mm_add_ps(FB, _mm_add_ps( GB , HB ) );
271             YB      = _mm_add_ps(YB, _mm_mul_ps(FB, epsB));
272             FB      = _mm_add_ps(FB, _mm_add_ps(GB , _mm_mul_ps(HB,two)));
273             vgbB    = _mm_mul_ps(YB, qqB);           
274             fijGBB  = _mm_mul_ps(FB, _mm_mul_ps(qqB,gbscaleB));
275             
276             
277             dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
278             dvdatmpB = _mm_mul_ps(_mm_add_ps(vgbB, _mm_mul_ps(fijGBB,rB)) , minushalf);
279
280             vgbtot  = _mm_add_ps(vgbtot, vgb);
281             vgbtot  = _mm_add_ps(vgbtot, vgbB);
282             
283             dvdasum = _mm_add_ps(dvdasum, dvdatmp);
284             dvdasum = _mm_add_ps(dvdasum, dvdatmpB);
285             dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
286             dvdatmpB = _mm_mul_ps(dvdatmpB, _mm_mul_ps(isajB,isajB));
287             
288             GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
289             GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrE,dvda+jnrF,dvda+jnrG,dvda+jnrH,dvdatmpB);
290                         
291                         rinvsix      = _mm_mul_ps(rinvsq,rinvsq);
292                         rinvsixB     = _mm_mul_ps(rinvsqB,rinvsqB);
293                         rinvsix      = _mm_mul_ps(rinvsix,rinvsq);
294                         rinvsixB     = _mm_mul_ps(rinvsixB,rinvsqB);
295                         
296                         vvdw6        = _mm_mul_ps(c6,rinvsix);
297                         vvdw6B       = _mm_mul_ps(c6B,rinvsixB);
298                         vvdw12       = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
299                         vvdw12B      = _mm_mul_ps(c12B, _mm_mul_ps(rinvsixB,rinvsixB));
300                         vvdwtot      = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
301                         vvdwtot      = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12B,vvdw6B));
302
303             fscal        = _mm_sub_ps(_mm_mul_ps(rinvsq, 
304                                                        _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
305                                                                   _mm_mul_ps(six,vvdw6))),
306                                       _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
307
308             fscalB       = _mm_sub_ps(_mm_mul_ps(rinvsqB, 
309                                                  _mm_sub_ps(_mm_mul_ps(twelve,vvdw12B),
310                                                             _mm_mul_ps(six,vvdw6B))),
311                                       _mm_mul_ps( _mm_sub_ps( fijGBB,fscalB),rinvB ));
312             
313             /***********************************/
314                         /*  INTERACTION SECTION ENDS HERE  */
315                         /***********************************/
316
317             /* Calculate temporary vectorial force */
318             tx           = _mm_mul_ps(fscal,dx);
319             ty           = _mm_mul_ps(fscal,dy);
320             tz           = _mm_mul_ps(fscal,dz);
321             txB          = _mm_mul_ps(fscalB,dxB);
322             tyB          = _mm_mul_ps(fscalB,dyB);
323             tzB          = _mm_mul_ps(fscalB,dzB);
324             
325             /* Increment i atom force */
326             fix          = _mm_add_ps(fix,tx);
327             fiy          = _mm_add_ps(fiy,ty);
328             fiz          = _mm_add_ps(fiz,tz);
329             fix          = _mm_add_ps(fix,txB);
330             fiy          = _mm_add_ps(fiy,tyB);
331             fiz          = _mm_add_ps(fiz,tzB);
332             
333             /* Store j forces back */
334                         GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
335                         GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3E,faction+j3F,faction+j3G,faction+j3H,txB,tyB,tzB);
336         }
337         for(;k<nj1-3; k+=4)
338                 {
339                         jnrA        = jjnr[k];   
340                         jnrB        = jjnr[k+1];
341                         jnrC        = jjnr[k+2];
342                         jnrD        = jjnr[k+3];
343             
344             j3A         = 3*jnrA;  
345                         j3B         = 3*jnrB;
346                         j3C         = 3*jnrC;
347                         j3D         = 3*jnrD;
348             
349             
350             GMX_MM_LOAD_1RVEC_4POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx,jy,jz);
351             
352                         dx           = _mm_sub_ps(ix,jx);
353                         dy           = _mm_sub_ps(iy,jy);
354                         dz           = _mm_sub_ps(iz,jz);
355                         
356                         rsq          = gmx_mm_calc_rsq_ps(dx,dy,dz);
357                         
358                         /* invsqrt */                                                           
359                         rinv         = gmx_mm_invsqrt_ps(rsq);
360                         rinvsq       = _mm_mul_ps(rinv,rinv);
361             
362                         /***********************************/
363                         /* INTERACTION SECTION STARTS HERE */
364                         /***********************************/
365                         GMX_MM_LOAD_4VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,charge+jnrD,jq);
366                         GMX_MM_LOAD_4VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,invsqrta+jnrD,isaj);
367             
368             /* Lennard-Jones */
369             tjA          = nti+2*type[jnrA];
370                         tjB          = nti+2*type[jnrB];
371                         tjC          = nti+2*type[jnrC];
372                         tjD          = nti+2*type[jnrD];
373             
374             GMX_MM_LOAD_4PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,vdwparam+tjD,c6,c12);
375                         
376                         isaprod      = _mm_mul_ps(isai,isaj);
377                         qq           = _mm_mul_ps(iq,jq);            
378                         vcoul        = _mm_mul_ps(qq,rinv);
379                         fscal        = _mm_mul_ps(vcoul,rinv);                                 
380             vctot        = _mm_add_ps(vctot,vcoul);
381             
382             /* Polarization interaction */
383                         qq           = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
384                         gbscale      = _mm_mul_ps(isaprod,gbtabscale);
385             
386                         /* Calculate GB table index */
387                         r            = _mm_mul_ps(rsq,rinv);
388                         rtab         = _mm_mul_ps(r,gbscale);
389                         
390                         n0           = _mm_cvttps_epi32(rtab);
391             eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
392                         nnn          = _mm_slli_epi32(n0,2);
393                         
394             /* the tables are 16-byte aligned, so we can use _mm_load_ps */                     
395             Y            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
396             F            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
397             G            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
398             H            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
399             _MM_TRANSPOSE4_PS(Y,F,G,H);
400             
401             G       = _mm_mul_ps(G,eps);
402             H       = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
403             F       = _mm_add_ps(F, _mm_add_ps( G , H ) );
404             Y       = _mm_add_ps(Y, _mm_mul_ps(F, eps));
405             F       = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
406             vgb     = _mm_mul_ps(Y, qq);           
407             fijGB   = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
408             
409             
410             dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);
411             
412             vgbtot  = _mm_add_ps(vgbtot, vgb);
413             
414             dvdasum = _mm_add_ps(dvdasum, dvdatmp);
415             dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
416             
417             GMX_MM_INCREMENT_4VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvda+jnrD,dvdatmp);
418             
419                         
420                         rinvsix      = _mm_mul_ps(rinvsq,rinvsq);
421                         rinvsix      = _mm_mul_ps(rinvsix,rinvsq);
422                         
423                         vvdw6        = _mm_mul_ps(c6,rinvsix);
424                         vvdw12       = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
425                         vvdwtot      = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
426             
427             fscal        = _mm_sub_ps(_mm_mul_ps(rinvsq, 
428                                                  _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
429                                                             _mm_mul_ps(six,vvdw6))),
430                                       _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
431             
432             /***********************************/
433                         /*  INTERACTION SECTION ENDS HERE  */
434                         /***********************************/
435             
436             /* Calculate temporary vectorial force */
437             tx           = _mm_mul_ps(fscal,dx);
438             ty           = _mm_mul_ps(fscal,dy);
439             tz           = _mm_mul_ps(fscal,dz);
440             
441             /* Increment i atom force */
442             fix          = _mm_add_ps(fix,tx);
443             fiy          = _mm_add_ps(fiy,ty);
444             fiz          = _mm_add_ps(fiz,tz);
445             
446             /* Store j forces back */
447                         GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,tx,ty,tz);
448         }
449         
450         offset = (nj1-nj0)%4;
451         
452                 if(offset!=0)
453         {
454             /* Epilogue loop to take care of non-4-multiple j particles */
455             if(offset==1)
456             {
457                 jnrA        = jjnr[k];   
458                 j3A         = 3*jnrA;  
459                 tjA          = nti+2*type[jnrA];
460                 GMX_MM_LOAD_1RVEC_1POINTER_PS(pos+j3A,jx,jy,jz);
461                 GMX_MM_LOAD_1VALUE_PS(charge+jnrA,jq);
462                 GMX_MM_LOAD_1VALUE_PS(invsqrta+jnrA,isaj);
463                 GMX_MM_LOAD_1PAIR_PS(vdwparam+tjA,c6,c12);
464                 mask        = mask1;
465             } 
466             else if(offset==2)
467             {
468                 jnrA        = jjnr[k];   
469                 jnrB        = jjnr[k+1];
470                 j3A         = 3*jnrA;  
471                 j3B         = 3*jnrB;
472                 tjA          = nti+2*type[jnrA];
473                 tjB          = nti+2*type[jnrB];
474                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(pos+j3A,pos+j3B,jx,jy,jz);
475                 GMX_MM_LOAD_2VALUES_PS(charge+jnrA,charge+jnrB,jq);
476                 GMX_MM_LOAD_2VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,isaj);
477                 GMX_MM_LOAD_2PAIRS_PS(vdwparam+tjA,vdwparam+tjB,c6,c12);
478                 mask        = mask2;
479             }
480             else
481             {
482                 /* offset must be 3 */   
483                 jnrA        = jjnr[k];   
484                 jnrB        = jjnr[k+1];
485                 jnrC        = jjnr[k+2];
486                 j3A         = 3*jnrA;  
487                 j3B         = 3*jnrB;
488                 j3C         = 3*jnrC;
489                 tjA          = nti+2*type[jnrA];
490                 tjB          = nti+2*type[jnrB];
491                 tjC          = nti+2*type[jnrC];
492                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(pos+j3A,pos+j3B,pos+j3C,jx,jy,jz);
493                 GMX_MM_LOAD_3VALUES_PS(charge+jnrA,charge+jnrB,charge+jnrC,jq);
494                 GMX_MM_LOAD_3VALUES_PS(invsqrta+jnrA,invsqrta+jnrB,invsqrta+jnrC,isaj);
495                 GMX_MM_LOAD_3PAIRS_PS(vdwparam+tjA,vdwparam+tjB,vdwparam+tjC,c6,c12);
496                 mask        = mask3;
497             }
498             
499             dx           = _mm_sub_ps(ix,jx);
500                         dy           = _mm_sub_ps(iy,jy);
501                         dz           = _mm_sub_ps(iz,jz);
502                         
503                         rsq          = gmx_mm_calc_rsq_ps(dx,dy,dz);
504                         
505                         /* invsqrt */                                                           
506                         rinv         = gmx_mm_invsqrt_ps(rsq);
507             rinv         = _mm_and_ps(rinv,mask);
508             
509                         rinvsq       = _mm_mul_ps(rinv,rinv);
510             
511                         /***********************************/
512                         /* INTERACTION SECTION STARTS HERE */
513                         /***********************************/
514             isaprod      = _mm_mul_ps(isai,isaj);
515                         qq           = _mm_and_ps(_mm_mul_ps(iq,jq),mask);        
516                         vcoul        = _mm_mul_ps(qq,rinv);
517                         fscal        = _mm_mul_ps(vcoul,rinv);                                 
518             vctot        = _mm_add_ps(vctot,vcoul);
519             
520             /* Polarization interaction */
521                         qq           = _mm_mul_ps(qq,_mm_mul_ps(isaprod,gbfactor));
522                         gbscale      = _mm_mul_ps(isaprod,gbtabscale);
523             
524                         /* Calculate GB table index */
525                         r            = _mm_mul_ps(rsq,rinv);
526                         rtab         = _mm_mul_ps(r,gbscale);
527                         
528                         n0           = _mm_cvttps_epi32(rtab);
529             eps          = _mm_sub_ps(rtab , _mm_cvtepi32_ps(n0) );
530                         nnn          = _mm_slli_epi32(n0,2);
531                         
532             /* the tables are 16-byte aligned, so we can use _mm_load_ps */                     
533             Y            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,0)); /* YFGH */
534             F            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,1)); /* YFGH */
535             G            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,2)); /* YFGH */
536             H            = _mm_load_ps(GBtab + gmx_mm_extract_epi32(nnn,3)); /* YFGH */
537             _MM_TRANSPOSE4_PS(Y,F,G,H);
538             
539             G       = _mm_mul_ps(G,eps);
540             H       = _mm_mul_ps(H, _mm_mul_ps(eps,eps) );
541             F       = _mm_add_ps(F, _mm_add_ps( G , H ) );
542             Y       = _mm_add_ps(Y, _mm_mul_ps(F, eps));
543             F       = _mm_add_ps(F, _mm_add_ps(G , _mm_mul_ps(H,two)));
544             vgb     = _mm_mul_ps(Y, qq);           
545             fijGB   = _mm_mul_ps(F, _mm_mul_ps(qq,gbscale));
546             
547             dvdatmp = _mm_mul_ps(_mm_add_ps(vgb, _mm_mul_ps(fijGB,r)) , minushalf);            
548             vgbtot  = _mm_add_ps(vgbtot, vgb);
549             
550             dvdasum = _mm_add_ps(dvdasum, dvdatmp);
551             dvdatmp = _mm_mul_ps(dvdatmp, _mm_mul_ps(isaj,isaj));
552             
553             /* Lennard-Jones */
554             
555                         rinvsix      = _mm_mul_ps(rinvsq,rinvsq);
556                         rinvsix      = _mm_mul_ps(rinvsix,rinvsq);
557                         
558                         vvdw6        = _mm_mul_ps(c6,rinvsix);
559                         vvdw12       = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
560                         vvdwtot      = _mm_add_ps(vvdwtot,_mm_sub_ps(vvdw12,vvdw6));
561             
562             fscal        = _mm_sub_ps(_mm_mul_ps(rinvsq, 
563                                                  _mm_sub_ps(_mm_mul_ps(twelve,vvdw12),
564                                                             _mm_mul_ps(six,vvdw6))),
565                                       _mm_mul_ps( _mm_sub_ps( fijGB,fscal),rinv ));
566             
567             /***********************************/
568                         /*  INTERACTION SECTION ENDS HERE  */
569                         /***********************************/
570             
571             /* Calculate temporary vectorial force */
572             tx           = _mm_mul_ps(fscal,dx);
573             ty           = _mm_mul_ps(fscal,dy);
574             tz           = _mm_mul_ps(fscal,dz);
575             
576             /* Increment i atom force */
577             fix          = _mm_add_ps(fix,tx);
578             fiy          = _mm_add_ps(fiy,ty);
579             fiz          = _mm_add_ps(fiz,tz);
580             
581             if(offset==1)
582             {
583                 GMX_MM_INCREMENT_1VALUE_PS(dvda+jnrA,dvdatmp);
584                 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(faction+j3A,tx,ty,tz);
585             } 
586             else if(offset==2)
587             {
588                 GMX_MM_INCREMENT_2VALUES_PS(dvda+jnrA,dvda+jnrB,dvdatmp);
589                 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(faction+j3A,faction+j3B,tx,ty,tz);
590             }
591             else
592             {
593                 /* offset must be 3 */
594                 GMX_MM_INCREMENT_3VALUES_PS(dvda+jnrA,dvda+jnrB,dvda+jnrC,dvdatmp);
595                 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(faction+j3A,faction+j3B,faction+j3C,tx,ty,tz);
596             }
597         }
598         
599         dvdasum = _mm_mul_ps(dvdasum, _mm_mul_ps(isai,isai));
600                 gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
601                 
602         ggid             = gid[n];         
603         GMX_MM_UPDATE_4POT_PS(vctot,vc+ggid,vvdwtot,vvdw+ggid,vgbtot,gpol+ggid,dvdasum,dvda+ii);
604     }
605         
606         *outeriter       = nri;            
607     *inneriter       = nj1;            
608 }
609
610
611
612 /*
613  * Gromacs nonbonded kernel nb_kernel410nf
614  * Coulomb interaction:     Generalized-Born
615  * VdW interaction:         Lennard-Jones
616  * water optimization:      No
617  * Calculate forces:        no
618  */
619 void nb_kernel410nf_x86_64_sse(
620                     int *           p_nri,
621                     int *           iinr,
622                     int *           jindex,
623                     int *           jjnr,
624                     int *           shift,
625                     float *         shiftvec,
626                     float *         fshift,
627                     int *           gid,
628                     float *         pos,
629                     float *         faction,
630                     float *         charge,
631                     float *         p_facel,
632                     float *         p_krf,
633                     float *         p_crf,
634                     float *         Vc,
635                     int *           type,
636                     int *           p_ntype,
637                     float *         vdwparam,
638                     float *         vvdw,
639                     float *         p_tabscale,
640                     float *         VFtab,
641                     float *         invsqrta,
642                     float *         dvda,
643                     float *         p_gbtabscale,
644                     float *         GBtab,
645                     int *           p_nthreads,
646                     int *           count,
647                     void *          mtx,
648                     int *           outeriter,
649                     int *           inneriter,
650                     float *         work)
651 {
652     int           nri,ntype,nthreads;
653     float         facel,krf,crf,tabscale,gbtabscale,vgb,fgb;
654     int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
655     float         shX,shY,shZ;
656     float         rinvsq;
657     float         iq;
658     float         qq,vcoul,vctot;
659     int           nti;
660     int           tj;
661     float         rinvsix;
662     float         vvdw6,vvdwtot;
663     float         vvdw12;
664     float         r,rt,eps,eps2;
665     int           n0,nnn;
666     float         Y,F,Geps,Heps2,Fp,VV;
667     float         isai,isaj,isaprod,gbscale;
668     float         ix1,iy1,iz1;
669     float         jx1,jy1,jz1;
670     float         dx11,dy11,dz11,rsq11,rinv11;
671     float         c6,c12;
672     const int     fractshift = 12;
673     const int     fractmask = 8388607;
674     const int     expshift = 23;
675     const int     expmask = 2139095040;
676     const int     explsb = 8388608;
677     float         lu;
678     int           iexp,addr;
679     union { unsigned int bval; float fval; } bitpattern,result;
680
681     nri              = *p_nri;         
682     ntype            = *p_ntype;       
683     nthreads         = *p_nthreads;    
684     facel            = *p_facel;       
685     krf              = *p_krf;         
686     crf              = *p_crf;         
687     tabscale         = *p_tabscale;    
688     gbtabscale       = *p_gbtabscale;  
689     nj1              = 0;              
690
691     for(n=0; (n<nri); n++)
692     {
693         is3              = 3*shift[n];     
694         shX              = shiftvec[is3];  
695         shY              = shiftvec[is3+1];
696         shZ              = shiftvec[is3+2];
697         nj0              = jindex[n];      
698         nj1              = jindex[n+1];    
699         ii               = iinr[n];        
700         ii3              = 3*ii;           
701         ix1              = shX + pos[ii3+0];
702         iy1              = shY + pos[ii3+1];
703         iz1              = shZ + pos[ii3+2];
704         iq               = facel*charge[ii];
705         isai             = invsqrta[ii];   
706         nti              = 2*ntype*type[ii];
707         vctot            = 0;              
708         vvdwtot          = 0;              
709         
710         for(k=nj0; (k<nj1); k++)
711         {
712             jnr              = jjnr[k];        
713             j3               = 3*jnr;          
714             jx1              = pos[j3+0];      
715             jy1              = pos[j3+1];      
716             jz1              = pos[j3+2];      
717             dx11             = ix1 - jx1;      
718             dy11             = iy1 - jy1;      
719             dz11             = iz1 - jz1;      
720             rsq11            = dx11*dx11+dy11*dy11+dz11*dz11;
721             bitpattern.fval  = rsq11;          
722             iexp             = (((bitpattern.bval)&expmask)>>expshift);
723             addr             = (((bitpattern.bval)&(fractmask|explsb))>>fractshift);
724             result.bval      = gmx_invsqrt_exptab[iexp] | gmx_invsqrt_fracttab[addr];
725             lu               = result.fval;    
726             rinv11           = (0.5*lu*(3.0-((rsq11*lu)*lu)));
727             isaj             = invsqrta[jnr];  
728             isaprod          = isai*isaj;      
729             qq               = iq*charge[jnr]; 
730             vcoul            = qq*rinv11;      
731             qq               = isaprod*(-qq);  
732             gbscale          = isaprod*gbtabscale;
733             tj               = nti+2*type[jnr];
734             c6               = vdwparam[tj];   
735             c12              = vdwparam[tj+1]; 
736             rinvsq           = rinv11*rinv11;  
737             r                = rsq11*rinv11;   
738             rt               = r*gbscale;      
739             n0               = rt;             
740             eps              = rt-n0;          
741             eps2             = eps*eps;        
742             nnn              = 4*n0;           
743             Y                = GBtab[nnn];     
744             F                = GBtab[nnn+1];   
745             Geps             = eps*GBtab[nnn+2];
746             Heps2            = eps2*GBtab[nnn+3];
747             Fp               = F+Geps+Heps2;   
748             VV               = Y+eps*Fp;       
749             vgb              = qq*VV;          
750             vctot            = vctot + vcoul;  
751             rinvsix          = rinvsq*rinvsq*rinvsq;
752             vvdw6            = c6*rinvsix;     
753             vvdw12           = c12*rinvsix*rinvsix;
754             vvdwtot          = vvdwtot+vvdw12-vvdw6;
755         }
756         
757         ggid             = gid[n];         
758         Vc[ggid]         = Vc[ggid] + vctot;
759         vvdw[ggid]       = vvdw[ggid] + vvdwtot;
760     }
761     
762     *outeriter       = nri;            
763     *inneriter       = nj1;            
764 }
765
766