Merge remote-tracking branch 'gerrit/release-4-6'
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel110_sse2_single.c
1 /*
2  *                This source code is part of
3  *
4  *                 G   R   O   M   A   C   S
5  *
6  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7  * Copyright (c) 2001-2009, The GROMACS Development Team
8  *
9  * Gromacs is a library for molecular simulation and trajectory analysis,
10  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11  * a full list of developers and information, check out http://www.gromacs.org
12  *
13  * This program is free software; you can redistribute it and/or modify it under 
14  * the terms of the GNU Lesser General Public License as published by the Free 
15  * Software Foundation; either version 2 of the License, or (at your option) any 
16  * later version.
17  * As a special exception, you may use this file as part of a free software
18  * library without restriction.  Specifically, if other files instantiate
19  * templates or use macros or inline functions from this file, or you compile
20  * this file and link it with other files to produce an executable, this
21  * file does not by itself cause the resulting executable to be covered by
22  * the GNU Lesser General Public License.  
23  *
24  * In plain-speak: do not worry about classes/macros/templates either - only
25  * changes to the library have to be LGPL, not an application linking with it.
26  *
27  * To help fund GROMACS development, we humbly ask that you cite
28  * the papers people have written on it - you can find them on the website!
29  */
30 #if 0
31
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
35
36
37 #include <stdio.h>
38
39 #include <emmintrin.h>
40
41 #include "sse_common_single.h"
42
43
44 /*
45  * Gromacs nonbonded kernel nb_kernel112
46  * Coulomb interaction:     Normal Coulomb
47  * VdW interaction:         Lennard-Jones
48  * water optimization:      pairs of SPC/TIP3P interactions
49  * Calculate forces:        yes
50  */
51 void nb_kernel110_sse2_single(
52                     int *           p_nri,
53                     int *           iinr,
54                     int *           jindex,
55                     int *           jjnr,
56                     int *           shift,
57                     float *         shiftvec,
58                     float *         fshift,
59                     int *           gid,
60                     float *         pos,
61                     float *         faction,
62                     float *         charge,
63                     float *         p_facel,
64                     float *         p_krf,
65                     float *         p_crf,
66                     float *         Vc,
67                     int *           type,
68                     int *           p_ntype,
69                     float *         vdwparam,
70                     float *         Vvdw,
71                     float *         p_tabscale,
72                     float *         VFtab,
73                     float *         invsqrta,
74                     float *         dvda,
75                     float *         p_gbtabscale,
76                     float *         GBtab,
77                     int *           p_nthreads,
78                     int *           count,
79                     void *          mtx,
80                     int *           outeriter,
81                     int *           inneriter,
82                     float *         work)
83 {
84     int           nri,ntype,nthreads,offset;
85     float         facel,krf,crf,tabscale,gbtabscale;
86     int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
87     float         shX,shY,shZ;
88     int           tj;
89     float         qO,qH;
90         int           j3A,j3B,j3C,j3D,jnrA,jnrB,jnrC,jnrD;
91         float         tfloat1[4],tfloat2[4],tfloat3[4];
92
93         __m128     mask1 = _mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
94         __m128     mask2 = _mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
95         __m128     mask3 = _mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
96         __m128     mask;
97         
98         __m128     qqOO,qqOH,qqHH,c6,c12,Vvdwtot,vctot;
99         __m128     dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
100         __m128     dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
101         __m128     dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
102         __m128     fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3;
103         __m128     fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3;
104         __m128     rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
105         __m128     tx,ty,tz;
106         __m128     ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
107         __m128     jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;  
108         __m128     t1,t2,t3;
109         __m128     rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
110         __m128     fscal11,fscal12,fscal13,fscal21,fscal22,fscal23,fscal31,fscal32,fscal33;
111         
112         __m128     half   = _mm_set1_ps(0.5);
113         __m128     three  = _mm_set1_ps(3.0);
114         __m128     six    = _mm_set1_ps(6.0);
115         __m128     twelve = _mm_set1_ps(12.0);
116         
117     nri              = *p_nri;         
118     ntype            = *p_ntype;       
119     nthreads         = *p_nthreads;    
120     facel            = *p_facel;       
121     krf              = *p_krf;         
122     crf              = *p_crf;         
123     tabscale         = *p_tabscale;    
124     ii               = iinr[0];        
125     qO               = charge[ii];     
126     qH               = charge[ii+1];   
127     qqOO             = _mm_set1_ps(facel*qO*qO);    
128     qqOH             = _mm_set1_ps(facel*qO*qH);    
129     qqHH             = _mm_set1_ps(facel*qH*qH);    
130     tj               = 2*(ntype+1)*type[ii];
131     c6               = _mm_set1_ps(vdwparam[tj]);   
132     c12              = _mm_set1_ps(vdwparam[tj+1]); 
133
134     nj1              = 0;              
135     
136     for(n=0; (n<nri); n++)
137     {
138         is3              = 3*shift[n];     
139         shX              = shiftvec[is3];  
140         shY              = shiftvec[is3+1];
141         shZ              = shiftvec[is3+2];
142         nj0              = jindex[n];      
143         nj1              = jindex[n+1];    
144         ii               = iinr[n];        
145         ii3              = 3*ii;           
146                 
147                 ix1      = _mm_set1_ps(shX+pos[ii3+0]);
148                 iy1      = _mm_set1_ps(shY+pos[ii3+1]);
149                 iz1      = _mm_set1_ps(shZ+pos[ii3+2]); 
150
151         vctot            = _mm_setzero_ps();              
152         Vvdwtot          = _mm_setzero_ps();               
153         fix1             = _mm_setzero_ps(); 
154         fiy1             = _mm_setzero_ps();               
155         fiz1             = _mm_setzero_ps();               
156         
157         for(k=nj0; k<nj1-3; k+=4)
158         {
159                         jnrA             = jjnr[k];
160                         jnrB             = jjnr[k+1];
161                         jnrC             = jjnr[k+2];
162                         jnrD             = jjnr[k+3];
163                         
164                         j3A              = jnrA * 3;
165                         j3B              = jnrB * 3;
166                         j3C              = jnrC * 3;
167                         j3D              = jnrD * 3;
168                         
169                         GMX_MM_LOAD_4JCOORD_1ATOM_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx1,jy1,jz1);
170                         
171                         dx11   = _mm_sub_ps(ix1,jx1);
172                         dy11   = _mm_sub_ps(iy1,jy1);
173                         dz11   = _mm_sub_ps(iz1,jz1);
174                         
175                         rsq11  = gmx_mm_scalarprod_ps(dx11,dy11,dz11);
176                         
177                         /* invsqrt */                                                           
178                         rinv11 = gmx_mm_invsqrt_ps(rsq11);
179
180                         /***********************************/
181                         /* INTERACTION SECTION STARTS HERE */
182                         /***********************************/
183                         
184                         fscal11 = _mm_add_ps(gmx_mm_int_coulomb_ps(rinv11,qqOO,&vctot),
185                                                                  gmx_mm_int_lj_ps(rinv11,c6,c12,&Vvdwtot));
186                         fscal11 = _mm_mul_ps(fscal11,_mm_mul_ps(rinv11,rinv11)); /*  *=rinvsq */
187                         
188                         /***********************************/
189                         /*  INTERACTION SECTION ENDS HERE  */
190                         /***********************************/
191                         
192                         /* Calculate vectorial forces from fscal */
193                         
194                         tx      = _mm_mul_ps(fscal11,dx11); 
195                         ty      = _mm_mul_ps(fscal11,dy11); 
196                         tz      = _mm_mul_ps(fscal11,dz11); 
197                         fix1    = _mm_add_ps(fix1,tx);      
198                         fiy1    = _mm_add_ps(fiy1,ty);      
199                         fiz1    = _mm_add_ps(fiz1,tz);     
200                         fjx1    = tx;
201                         fjy1    = ty;
202                         fjz1    = tz;
203                         
204                         /* Store j forces back */
205                         GMX_MM_UPDATE_4JCOORD_1ATOM_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,fjx1,fjy1,fjz1);
206         }
207                 
208         /* Treat offset particles here */
209                 if(k<nj1)
210         {
211         
212                         if(k<nj1-2)
213                         {
214                                 jnrA             = jjnr[k];
215                                 jnrB             = jjnr[k+1];
216                                 jnrC             = jjnr[k+2];
217                                 j3A              = jnrA * 3;
218                                 j3B              = jnrB * 3;
219                                 j3C              = jnrC * 3;
220                                 GMX_MM_LOAD_3JCOORD_1ATOM_PS(pos+j3A,pos+j3B,pos+j3C,jx1,jy1,jz1);                         
221                         }
222                         else if(k<nj1-1)
223                         {
224                                 jnrA             = jjnr[k];
225                                 jnrB             = jjnr[k+1];
226                                 j3A              = jnrA * 3;
227                                 j3B              = jnrB * 3;
228                                 GMX_MM_LOAD_2JCOORD_1ATOM_PS(pos+j3A,pos+j3B,jx1,jy1,jz1);
229                         }
230                         else
231                         {
232                                 jnrA             = jjnr[k];
233                                 j3A              = jnrA * 3;
234                                 GMX_MM_LOAD_1JCOORD_1ATOM_PS(pos+j3A,jx1,jy1,jz1);
235                         }
236                         
237                         dx11   = _mm_sub_ps(ix1,jx1);
238                         dy11   = _mm_sub_ps(iy1,jy1);
239                         dz11   = _mm_sub_ps(iz1,jz1);
240                         
241                         rsq11  = gmx_mm_scalarprod_ps(dx11,dy11,dz11);
242                         
243                         /* invsqrt */                                   
244                         rinv11 = _mm_and_ps(mask,gmx_mm_invsqrt_ps(rsq11));
245                         
246                         /***********************************/
247                         /* INTERACTION SECTION STARTS HERE */
248                         /***********************************/
249                         
250                         fscal11 = _mm_add_ps(gmx_mm_int_coulomb_ps(rinv11,qqOO,&vctot),
251                                                                  gmx_mm_int_lj_ps(rinv11,c6,c12,&Vvdwtot));
252                         fscal11 = _mm_mul_ps(fscal11,_mm_mul_ps(rinv11,rinv11)); /*  *=rinvsq */
253                         
254                         /***********************************/
255                         /*  INTERACTION SECTION ENDS HERE  */
256                         /***********************************/
257                         
258                         /* Calculate vectorial forces */
259                         tx      = _mm_mul_ps(fscal11,dx11);
260                         ty      = _mm_mul_ps(fscal11,dy11);
261                         tz      = _mm_mul_ps(fscal11,dz11);
262                         fix1    = _mm_add_ps(fix1,tx);
263                         fiy1    = _mm_add_ps(fiy1,ty);
264                         fiz1    = _mm_add_ps(fiz1,tz);
265                         fjx1    = tx;
266                         fjy1    = ty;
267                         fjz1    = tz;
268
269                         if(k<nj1-2)
270                         {
271                                 GMX_MM_UPDATE_3JCOORD_1ATOM_PS(faction+j3A,faction+j3B,faction+j3C,fjx1,fjy1,fjz1);
272                         }
273                         else if(k<nj1-1)
274                         {
275                                 GMX_MM_UPDATE_2JCOORD_1ATOM_PS(faction+j3A,faction+j3B,fjx1,fjy1,fjz1);
276                         }
277                         else
278                         {
279                                 GMX_MM_UPDATE_1JCOORD_1ATOM_PS(faction+j3A,fjx1,fjy1,fjz1);
280                         }
281                 }
282                 
283                 gmx_mm_update_iforce_1atom_ps(&fix1,&fiy1,&fiz1,faction+ii3,fshift+is3);
284                 
285         ggid             = gid[n];         
286         GMX_MM_UPDATE_2POT_PS(vctot,Vc+ggid,Vvdwtot,Vvdw+ggid);
287     }
288     
289     *outeriter       = nri;            
290     *inneriter       = nj1;            
291 }
292
293 #else
294 int nb_kernel110_sse2_single_dummy;
295 #endif