/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 * Copyright (c) 2012, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/* This include file has code between ifdef's to make sure
 * that this performance-sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
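/* Usage sketch (illustrative, not copied from the callers): the including
 * source file defines exactly one of the block-selection macros and then
 * includes this file at the point where the code should be expanded, e.g.
 *
 *     #define PME_SPREAD_SSE_ORDER4
 *     #include "pme_sse_single.h"
 *
 * or, for the aligned variants which also need the interpolation order,
 *
 *     #define PME_ORDER 5
 *     #define PME_SPREAD_SSE_ALIGNED
 *     #include "pme_sse_single.h"
 *
 * All variables referenced below (grid, thx, thy, thz, dthx, dthy, dthz,
 * qn, i0, j0, k0, pny, pnz, ithx, ithy, index_x, index_xy, valx, fx, fy,
 * fz, work) are assumed to be declared at the include site.
 */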

#ifdef PME_SPREAD_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
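/* Added explanation: this block performs the charge spreading
 *
 *     grid[(i0+ix)*pny*pnz + (j0+iy)*pnz + k0 + iz] +=
 *         qn*thx[ix]*thy[iy]*thz[iz]
 *
 * for ix,iy,iz = 0..3, vectorized over the four z points: thz sits in one
 * SSE register, each thy value is broadcast, and the four y rows are
 * updated with unaligned loads and stores.
 */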
{
    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3;
    __m128 tz_SSE;
    __m128 vx_SSE;
    __m128 vx_tz_SSE;
    __m128 sum_SSE0,sum_SSE1,sum_SSE2,sum_SSE3;
    __m128 gri_SSE0,gri_SSE1,gri_SSE2,gri_SSE3;

    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);

    tz_SSE  = _mm_loadu_ps(thz);

    for(ithx=0; (ithx<4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        valx    = qn*thx[ithx];

        vx_SSE   = _mm_load1_ps(&valx);

        vx_tz_SSE = _mm_mul_ps(vx_SSE,tz_SSE);

        gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0);
        gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0);
        gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0);
        gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0);

        sum_SSE0 = _mm_add_ps(gri_SSE0,_mm_mul_ps(vx_tz_SSE,ty_SSE0));
        sum_SSE1 = _mm_add_ps(gri_SSE1,_mm_mul_ps(vx_tz_SSE,ty_SSE1));
        sum_SSE2 = _mm_add_ps(gri_SSE2,_mm_mul_ps(vx_tz_SSE,ty_SSE2));
        sum_SSE3 = _mm_add_ps(gri_SSE3,_mm_mul_ps(vx_tz_SSE,ty_SSE3));

        _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0,sum_SSE0);
        _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0,sum_SSE1);
        _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0,sum_SSE2);
        _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0,sum_SSE3);
    }
}
#undef PME_SPREAD_SSE_ORDER4
#endif


#ifdef PME_GATHER_F_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
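/* Added explanation: this block accumulates the force contributions
 *
 *     fx += sum dthx[ix]* thy[iy]* thz[iz]*grid[...]
 *     fy += sum  thx[ix]*dthy[iy]* thz[iz]*grid[...]
 *     fz += sum  thx[ix]* thy[iy]*dthz[iz]*grid[...]
 *
 * over ix,iy,iz = 0..3, i.e. one spline factor is replaced by its
 * derivative for each force component. The sums are vectorized over the
 * four z points and the four lanes of each SSE accumulator are reduced
 * to a scalar via fx_tmp/fy_tmp/fz_tmp at the end.
 */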
{
    float fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE;
    __m128 dx_SSE,dy_SSE,dz_SSE;

    __m128 gval_SSE;

    __m128 fxy1_SSE;
    __m128 fz1_SSE;

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    tz_SSE  = _mm_loadu_ps(thz);
    dz_SSE  = _mm_loadu_ps(dthz);

    for(ithx=0; (ithx<4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_SSE   = _mm_load1_ps(thx+ithx);
        dx_SSE   = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<4); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            gval_SSE = _mm_loadu_ps(grid+index_xy+k0);

            fxy1_SSE = _mm_mul_ps(tz_SSE,gval_SSE);
            fz1_SSE  = _mm_mul_ps(dz_SSE,gval_SSE);

            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }

    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SSE_ORDER4
#endif


#ifdef PME_SPREAD_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
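/* Added explanation of the alignment handling: offset = k0 & 3 rounds k0
 * down to a 16-byte boundary, so all grid accesses below can use aligned
 * loads and stores. To compensate, thz is loaded shifted down by offset
 * and the lanes outside the spline support are zeroed with the
 * precomputed masks work->mask_SSE0/1[offset]; those lanes then add
 * nothing, and the store writes the original grid values back unchanged.
 * Each y row spans two SSE registers, since offset + PME_ORDER <= 8.
 */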
{
    int offset;
    int index;
    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3,ty_SSE4;
    __m128 tz_SSE0;
    __m128 tz_SSE1;
    __m128 vx_SSE;
    __m128 vx_tz_SSE0;
    __m128 vx_tz_SSE1;
    __m128 sum_SSE00,sum_SSE01,sum_SSE02,sum_SSE03,sum_SSE04;
    __m128 sum_SSE10,sum_SSE11,sum_SSE12,sum_SSE13,sum_SSE14;
    __m128 gri_SSE00,gri_SSE01,gri_SSE02,gri_SSE03,gri_SSE04;
    __m128 gri_SSE10,gri_SSE11,gri_SSE12,gri_SSE13,gri_SSE14;

    offset = k0 & 3;

    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);
#if PME_ORDER == 5
    ty_SSE4 = _mm_load1_ps(&thy[4]);
#endif

    tz_SSE0 = _mm_loadu_ps(thz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);

    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        valx  = qn*thx[ithx];

        vx_SSE   = _mm_load1_ps(&valx);

        vx_tz_SSE0 = _mm_mul_ps(vx_SSE,tz_SSE0);
        vx_tz_SSE1 = _mm_mul_ps(vx_SSE,tz_SSE1);

        gri_SSE00 = _mm_load_ps(grid+index+0*pnz);
        gri_SSE01 = _mm_load_ps(grid+index+1*pnz);
        gri_SSE02 = _mm_load_ps(grid+index+2*pnz);
        gri_SSE03 = _mm_load_ps(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_SSE04 = _mm_load_ps(grid+index+4*pnz);
#endif
        gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4);
        gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4);
        gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4);
        gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4);
#endif

        sum_SSE00 = _mm_add_ps(gri_SSE00,_mm_mul_ps(vx_tz_SSE0,ty_SSE0));
        sum_SSE01 = _mm_add_ps(gri_SSE01,_mm_mul_ps(vx_tz_SSE0,ty_SSE1));
        sum_SSE02 = _mm_add_ps(gri_SSE02,_mm_mul_ps(vx_tz_SSE0,ty_SSE2));
        sum_SSE03 = _mm_add_ps(gri_SSE03,_mm_mul_ps(vx_tz_SSE0,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE04 = _mm_add_ps(gri_SSE04,_mm_mul_ps(vx_tz_SSE0,ty_SSE4));
#endif
        sum_SSE10 = _mm_add_ps(gri_SSE10,_mm_mul_ps(vx_tz_SSE1,ty_SSE0));
        sum_SSE11 = _mm_add_ps(gri_SSE11,_mm_mul_ps(vx_tz_SSE1,ty_SSE1));
        sum_SSE12 = _mm_add_ps(gri_SSE12,_mm_mul_ps(vx_tz_SSE1,ty_SSE2));
        sum_SSE13 = _mm_add_ps(gri_SSE13,_mm_mul_ps(vx_tz_SSE1,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE14 = _mm_add_ps(gri_SSE14,_mm_mul_ps(vx_tz_SSE1,ty_SSE4));
#endif

        _mm_store_ps(grid+index+0*pnz,sum_SSE00);
        _mm_store_ps(grid+index+1*pnz,sum_SSE01);
        _mm_store_ps(grid+index+2*pnz,sum_SSE02);
        _mm_store_ps(grid+index+3*pnz,sum_SSE03);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz,sum_SSE04);
#endif
        _mm_store_ps(grid+index+0*pnz+4,sum_SSE10);
        _mm_store_ps(grid+index+1*pnz+4,sum_SSE11);
        _mm_store_ps(grid+index+2*pnz+4,sum_SSE12);
        _mm_store_ps(grid+index+3*pnz+4,sum_SSE13);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz+4,sum_SSE14);
#endif
    }
}
#undef PME_ORDER
#undef PME_SPREAD_SSE_ALIGNED
#endif


#ifdef PME_GATHER_F_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
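/* Added explanation: same alignment trick as in the aligned spread above.
 * The grid is read with aligned loads starting at k0-offset, and thz/dthz
 * are loaded with the same shift and masked, so the lanes outside the
 * spline support contribute zero to the force sums.
 */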
{
    int   offset;

    float fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE0,tz_SSE1;
    __m128 dx_SSE,dy_SSE,dz_SSE0,dz_SSE1;

    __m128 gval_SSE0;
    __m128 gval_SSE1;

    __m128 fxy1_SSE0;
    __m128 fz1_SSE0;
    __m128 fxy1_SSE1;
    __m128 fz1_SSE1;
    __m128 fxy1_SSE;
    __m128 fz1_SSE;

    offset = k0 & 3;

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    tz_SSE0 = _mm_loadu_ps(thz-offset);
    dz_SSE0 = _mm_loadu_ps(dthz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    dz_SSE1 = _mm_loadu_ps(dthz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    dz_SSE0 = _mm_and_ps(dz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);
    dz_SSE1 = _mm_and_ps(dz_SSE1,work->mask_SSE1[offset]);

    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_SSE   = _mm_load1_ps(thx+ithx);
        dx_SSE   = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<PME_ORDER); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset);
            gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4);

            fxy1_SSE0 = _mm_mul_ps(tz_SSE0,gval_SSE0);
            fz1_SSE0  = _mm_mul_ps(dz_SSE0,gval_SSE0);
            fxy1_SSE1 = _mm_mul_ps(tz_SSE1,gval_SSE1);
            fz1_SSE1  = _mm_mul_ps(dz_SSE1,gval_SSE1);

            fxy1_SSE = _mm_add_ps(fxy1_SSE0,fxy1_SSE1);
            fz1_SSE  = _mm_add_ps(fz1_SSE0,fz1_SSE1);

            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }

    /* fx_tmp/fy_tmp/fz_tmp are plain stack arrays with no guaranteed
     * 16-byte alignment, so use unaligned stores here, as in the
     * unaligned gather block above. */
    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_ORDER
#undef PME_GATHER_F_SSE_ALIGNED
#endif