Merge remote-tracking branch 'gerrit/release-4-6'
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel400_sse2_double.c
1 /*
2  * Copyright (c) Erik Lindahl, David van der Spoel 2003
3  * 
4  * This file is generated automatically at compile time
5  * by the program mknb in the Gromacs distribution.
6  *
7  * Options used when generation this file:
8  * Language:         c
9  * Precision:        double
10  * Threads:          no
11  * Software invsqrt: yes
12  * Prefetch forces:  no
13  * Comments:         no
14  */
15 #ifdef HAVE_CONFIG_H
16 #include<config.h>
17 #endif
18 #include<math.h>
19 #include<vec.h>
20
21 #include <emmintrin.h>
22
23 #include <gmx_sse2_double.h>
24
25 /* get gmx_gbdata_t */
26 #include "../nb_kerneltype.h"
27
28
29 void nb_kernel400_sse2_double(int *           p_nri,
30                               int *           iinr,
31                               int *           jindex,
32                               int *           jjnr,
33                               int *           shift,
34                               double *         shiftvec,
35                               double *         fshift,
36                               int *           gid,
37                               double *         pos,
38                               double *         faction,
39                               double *         charge,
40                               double *         p_facel,
41                               double *         p_krf,
42                               double *         p_crf,
43                               double *         vc,
44                               int *           type,
45                               int *           p_ntype,
46                               double *         vdwparam,
47                               double *         vvdw,
48                               double *         p_tabscale,
49                               double *         VFtab,
50                               double *         invsqrta,
51                               double *         dvda,
52                               double *         p_gbtabscale,
53                               double *         GBtab,
54                               int *           p_nthreads,
55                               int *           count,
56                               void *          mtx,
57                               int *           outeriter,
58                               int *           inneriter,
59                               double *         work)
60 {
61   int           nri,nthreads;
62   int           n,ii,is3,ii3,k,nj0,nj1,ggid;
63   double        shX,shY,shZ;
64   int           jnrA,jnrB;
65   int           j3A,j3B;
66         gmx_gbdata_t *gbdata;
67         double *      gpol;
68     
69         __m128d  iq,qq,jq,isai;
70         __m128d  ix,iy,iz;
71         __m128d  jx,jy,jz;
72         __m128d  dx,dy,dz;
73         __m128d  vctot,vgbtot,dvdasum,gbfactor;
74         __m128d  fix,fiy,fiz,tx,ty,tz,rsq;
75         __m128d  rinv,isaj,isaprod;
76         __m128d  vcoul,fscal,gbscale;
77         __m128d  rinvsq,r,rtab;
78         __m128d  eps,Y,F,G,H;
79         __m128d  vgb,fijGB,dvdatmp;
80         __m128d  facel,gbtabscale,dvdaj;
81         __m128i  n0, nnn;
82         
83         const __m128d neg        = _mm_set1_pd(-1.0);
84         const __m128d zero       = _mm_set1_pd(0.0);
85         const __m128d minushalf  = _mm_set1_pd(-0.5);
86         const __m128d two        = _mm_set1_pd(2.0);
87         
88         gbdata     = (gmx_gbdata_t *)work;
89         gpol       = gbdata->gpol;
90     
91         nri        = *p_nri;
92     
93   gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
94   gbtabscale = _mm_load1_pd(p_gbtabscale);  
95   facel      = _mm_load1_pd(p_facel);
96   
97   nj1         = 0;
98   jnrA = jnrB = 0;
99   j3A = j3B   = 0;
100   jx          = _mm_setzero_pd();
101   jy          = _mm_setzero_pd();
102   jz          = _mm_setzero_pd();
103         
104         for(n=0;n<nri;n++)
105         {
106     is3              = 3*shift[n];     
107     shX              = shiftvec[is3];  
108     shY              = shiftvec[is3+1];
109     shZ              = shiftvec[is3+2];
110     nj0              = jindex[n];      
111     nj1              = jindex[n+1];    
112     ii               = iinr[n];        
113     ii3              = 3*ii;           
114                 
115                 ix               = _mm_set1_pd(shX+pos[ii3+0]);
116                 iy               = _mm_set1_pd(shY+pos[ii3+1]);
117                 iz               = _mm_set1_pd(shZ+pos[ii3+2]);
118     
119                 iq               = _mm_load1_pd(charge+ii);
120                 iq               = _mm_mul_pd(iq,facel);
121     
122                 isai             = _mm_load1_pd(invsqrta+ii);
123                         
124                 vctot            = _mm_setzero_pd();
125                 vgbtot           = _mm_setzero_pd();
126                 dvdasum          = _mm_setzero_pd();
127                 fix              = _mm_setzero_pd();
128                 fiy              = _mm_setzero_pd();
129                 fiz              = _mm_setzero_pd();
130                 
131                 for(k=nj0;k<nj1-1; k+=2)
132                 {
133                         jnrA    = jjnr[k];
134                         jnrB    = jjnr[k+1];
135                         
136                         j3A     = jnrA * 3;
137                         j3B     = jnrB * 3;
138       
139       GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
140             
141                         dx           = _mm_sub_pd(ix,jx);
142                         dy           = _mm_sub_pd(iy,jy);
143                         dz           = _mm_sub_pd(iz,jz);
144             
145       rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
146       
147       rinv         = gmx_mm_invsqrt_pd(rsq);
148                         rinvsq       = _mm_mul_pd(rinv,rinv);
149       
150                         /***********************************/
151                         /* INTERACTION SECTION STARTS HERE */
152                         /***********************************/
153                         GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
154                         GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
155             
156                         isaprod      = _mm_mul_pd(isai,isaj);
157                         qq           = _mm_mul_pd(iq,jq);            
158                         vcoul        = _mm_mul_pd(qq,rinv);
159                         fscal        = _mm_mul_pd(vcoul,rinv);                                 
160       vctot        = _mm_add_pd(vctot,vcoul);
161             
162             /* Polarization interaction */
163                         qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
164                         gbscale      = _mm_mul_pd(isaprod,gbtabscale);
165             
166                         /* Calculate GB table index */
167                         r            = _mm_mul_pd(rsq,rinv);
168                         rtab         = _mm_mul_pd(r,gbscale);
169                         
170                         n0                   = _mm_cvttpd_epi32(rtab);
171                         eps              = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
172                         nnn                  = _mm_slli_epi32(n0,2);
173                         
174       /* the tables are 16-byte aligned, so we can use _mm_load_pd */                   
175       Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
176       F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
177       GMX_MM_TRANSPOSE2_PD(Y,F);
178       G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
179       H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
180       GMX_MM_TRANSPOSE2_PD(G,H);
181       
182       G       = _mm_mul_pd(G,eps);
183       H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
184       F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
185       Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
186       F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
187       vgb     = _mm_mul_pd(Y, qq);           
188       fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
189       
190       dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
191       
192       vgbtot  = _mm_add_pd(vgbtot, vgb);
193       
194       dvdasum = _mm_add_pd(dvdasum, dvdatmp);
195       dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
196       
197       GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
198       
199       fscal        = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
200       
201       /***********************************/
202                         /*  INTERACTION SECTION ENDS HERE  */
203                         /***********************************/
204       
205       /* Calculate temporary vectorial force */
206       tx           = _mm_mul_pd(fscal,dx);
207       ty           = _mm_mul_pd(fscal,dy);
208       tz           = _mm_mul_pd(fscal,dz);
209       
210       /* Increment i atom force */
211       fix          = _mm_add_pd(fix,tx);
212       fiy          = _mm_add_pd(fiy,ty);
213       fiz          = _mm_add_pd(fiz,tz);
214       
215       /* Store j forces back */
216                         GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
217                 }
218                 
219                 /* In double precision, offset can only be either 0 or 1 */
220                 if(k<nj1)
221                 {
222                         jnrA    = jjnr[k];
223                         j3A     = jnrA * 3;
224       
225       GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
226       
227                         dx           = _mm_sub_sd(ix,jx);
228                         dy           = _mm_sub_sd(iy,jy);
229                         dz           = _mm_sub_sd(iz,jz);
230       
231       rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
232       
233       rinv         = gmx_mm_invsqrt_pd(rsq);
234                         rinvsq       = _mm_mul_sd(rinv,rinv);
235       
236       /* These reason for zeroing these variables here is for fixing bug 585
237        * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
238        * and r1=0, but it should be r1=a[1]. 
239        * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
240        * To work around it, we zero these variables and use _mm_add_pd (**) instead
241        * Note that the only variables that get affected are the energies since
242        * the total sum needs to be correct 
243        */
244       vgb          = _mm_setzero_pd();
245       vcoul        = _mm_setzero_pd();
246       dvdatmp      = _mm_setzero_pd();
247       
248                         /***********************************/
249                         /* INTERACTION SECTION STARTS HERE */
250                         /***********************************/
251                         GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
252                         GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
253       
254                         isaprod      = _mm_mul_sd(isai,isaj);
255                         qq           = _mm_mul_sd(jq,iq);            
256                         vcoul        = _mm_mul_sd(qq,rinv);
257                         fscal        = _mm_mul_sd(vcoul,rinv);                                 
258       vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
259       
260       /* Polarization interaction */
261                         qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
262                         gbscale      = _mm_mul_sd(isaprod,gbtabscale);
263       
264                         /* Calculate GB table index */
265                         r            = _mm_mul_sd(rsq,rinv);
266                         rtab         = _mm_mul_sd(r,gbscale);
267       
268                         n0                   = _mm_cvttpd_epi32(rtab);
269                         eps              = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
270                         nnn                  = _mm_slli_epi32(n0,2);
271                         
272       /* the tables are 16-byte aligned, so we can use _mm_load_pd */                   
273       Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
274       F            = _mm_setzero_pd();
275       GMX_MM_TRANSPOSE2_PD(Y,F);
276       G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
277       H            = _mm_setzero_pd();
278       GMX_MM_TRANSPOSE2_PD(G,H);
279       
280       G       = _mm_mul_sd(G,eps);
281       H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
282       F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
283       Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
284       F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
285       vgb     = _mm_mul_sd(Y, qq);           
286       fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
287       
288       dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
289       
290       vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
291       
292       dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
293       dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
294       
295       GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
296                         
297       fscal        = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
298       
299       /***********************************/
300                         /*  INTERACTION SECTION ENDS HERE  */
301                         /***********************************/
302       
303       /* Calculate temporary vectorial force */
304       tx           = _mm_mul_sd(fscal,dx);
305       ty           = _mm_mul_sd(fscal,dy);
306       tz           = _mm_mul_sd(fscal,dz);
307       
308       /* Increment i atom force */
309       fix          = _mm_add_sd(fix,tx);
310       fiy          = _mm_add_sd(fiy,ty);
311       fiz          = _mm_add_sd(fiz,tz);
312       
313       /* Store j forces back */
314                         GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
315                 }
316                 
317     dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
318     gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
319     
320     ggid     = gid[n];         
321     
322     gmx_mm_update_1pot_pd(vctot,vc+ggid);
323     gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
324     gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
325   }
326   
327         *outeriter   = nri;            
328   *inneriter   = nj1;   
329 }