/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 *                        VERSION 4.5
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * GROwing Monsters And Cloning Shrimps
 */

/* This include file has code between #ifdef's to ensure
 * that this performance-sensitive code is inlined,
 * and that conditionals and variable loop bounds are removed at compile time.
 */
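
/* A minimal sketch of the intended usage from an including source file
 * (an assumption for illustration; the actual include sites in the PME
 * spread/gather routines are not shown here):
 *
 *     #define PME_SPREAD_SSE_ORDER4
 *     #include "pme_sse_single.h"
 *
 * or, for the aligned variants, which also need the interpolation order:
 *
 *     #define PME_ORDER 5
 *     #define PME_SPREAD_SSE_ALIGNED
 *     #include "pme_sse_single.h"
 *
 * Each block #undef's its controlling macros at the end, so this file can
 * be included several times with different settings in one compilation unit.
 */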

#ifdef PME_SPREAD_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
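/* The including scope is expected to provide (not declared here):
 *   thx,thy,thz - the spline weights along x, y and z for this charge,
 *   qn          - the charge,
 *   grid        - the charge grid; pny and pnz are its allocated y/z sizes,
 *   i0,j0,k0    - the lower corner of this charge's grid stencil,
 *   ithx, index_x, valx - loop and scratch variables.
 */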
{
    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3;
    __m128 tz_SSE;
    __m128 vx_SSE;
    __m128 vx_tz_SSE;
    __m128 sum_SSE0,sum_SSE1,sum_SSE2,sum_SSE3;
    __m128 gri_SSE0,gri_SSE1,gri_SSE2,gri_SSE3;

    /* Broadcast each of the 4 y spline weights to all four SSE lanes */
    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);

    /* Load all 4 z spline weights with one unaligned load */
    tz_SSE  = _mm_loadu_ps(thz);

    for(ithx=0; (ithx<4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        valx    = qn*thx[ithx];

        vx_SSE   = _mm_load1_ps(&valx);

        /* charge*thx[ithx]*thz[0..3], shared by all four y rows */
        vx_tz_SSE = _mm_mul_ps(vx_SSE,tz_SSE);

        /* Read-modify-write one 4-element z row per y grid line */
        gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0);
        gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0);
        gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0);
        gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0);

        sum_SSE0 = _mm_add_ps(gri_SSE0,_mm_mul_ps(vx_tz_SSE,ty_SSE0));
        sum_SSE1 = _mm_add_ps(gri_SSE1,_mm_mul_ps(vx_tz_SSE,ty_SSE1));
        sum_SSE2 = _mm_add_ps(gri_SSE2,_mm_mul_ps(vx_tz_SSE,ty_SSE2));
        sum_SSE3 = _mm_add_ps(gri_SSE3,_mm_mul_ps(vx_tz_SSE,ty_SSE3));

        _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0,sum_SSE0);
        _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0,sum_SSE1);
        _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0,sum_SSE2);
        _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0,sum_SSE3);
    }
}
#undef PME_SPREAD_SSE_ORDER4
#endif


#ifdef PME_GATHER_F_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
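/* The force components are accumulated over the stencil as sums of grid
 * values weighted by spline products, with the derivative spline (dth*)
 * taken along the component's own dimension:
 *   fx += dthx*thy*thz*grid
 *   fy += thx*dthy*thz*grid
 *   fz += thx*thy*dthz*grid
 */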
{
    float fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE;
    __m128 dx_SSE,dy_SSE,dz_SSE;

    __m128 gval_SSE;

    __m128 fxy1_SSE;
    __m128 fz1_SSE;

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    /* Load the z spline weights and their derivatives (unaligned) */
    tz_SSE  = _mm_loadu_ps(thz);
    dz_SSE  = _mm_loadu_ps(dthz);

    for(ithx=0; (ithx<4); ithx++)
    {
        index_x  = (i0+ithx)*pny*pnz;
        tx_SSE   = _mm_load1_ps(thx+ithx);
        dx_SSE   = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<4); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            gval_SSE = _mm_loadu_ps(grid+index_xy+k0);

            fxy1_SSE = _mm_mul_ps(tz_SSE,gval_SSE);
            fz1_SSE  = _mm_mul_ps(dz_SSE,gval_SSE);

            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }

    /* Horizontal sum: reduce the four SSE lanes to scalar force contributions */
    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SSE_ORDER4
#endif


#ifdef PME_SPREAD_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
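/* Instead of unaligned accesses, k0 is rounded down to a multiple of 4
 * (offset = k0 & 3) so that all grid loads and stores are 16-byte aligned.
 * The z spline weights are shifted by the same offset and the elements
 * outside [k0,k0+PME_ORDER) are zeroed with the precomputed masks
 * work->mask_SSE0[offset] and work->mask_SSE1[offset], so the padding
 * elements are read and written back unchanged.
 */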
{
    int offset;
    int index;
    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3,ty_SSE4;
    __m128 tz_SSE0;
    __m128 tz_SSE1;
    __m128 vx_SSE;
    __m128 vx_tz_SSE0;
    __m128 vx_tz_SSE1;
    __m128 sum_SSE00,sum_SSE01,sum_SSE02,sum_SSE03,sum_SSE04;
    __m128 sum_SSE10,sum_SSE11,sum_SSE12,sum_SSE13,sum_SSE14;
    __m128 gri_SSE00,gri_SSE01,gri_SSE02,gri_SSE03,gri_SSE04;
    __m128 gri_SSE10,gri_SSE11,gri_SSE12,gri_SSE13,gri_SSE14;

    /* Round k0 down to the nearest multiple of 4 for aligned grid access */
    offset = k0 & 3;

    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);
#if PME_ORDER == 5
    ty_SSE4 = _mm_load1_ps(&thy[4]);
#endif

    /* Shift the z weights by offset and zero the out-of-stencil elements */
    tz_SSE0 = _mm_loadu_ps(thz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);

    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        valx  = qn*thx[ithx];

        vx_SSE   = _mm_load1_ps(&valx);

        vx_tz_SSE0 = _mm_mul_ps(vx_SSE,tz_SSE0);
        vx_tz_SSE1 = _mm_mul_ps(vx_SSE,tz_SSE1);

        /* Aligned loads of two 4-element z chunks per y grid line */
        gri_SSE00 = _mm_load_ps(grid+index+0*pnz);
        gri_SSE01 = _mm_load_ps(grid+index+1*pnz);
        gri_SSE02 = _mm_load_ps(grid+index+2*pnz);
        gri_SSE03 = _mm_load_ps(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_SSE04 = _mm_load_ps(grid+index+4*pnz);
#endif
        gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4);
        gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4);
        gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4);
        gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4);
#endif

        sum_SSE00 = _mm_add_ps(gri_SSE00,_mm_mul_ps(vx_tz_SSE0,ty_SSE0));
        sum_SSE01 = _mm_add_ps(gri_SSE01,_mm_mul_ps(vx_tz_SSE0,ty_SSE1));
        sum_SSE02 = _mm_add_ps(gri_SSE02,_mm_mul_ps(vx_tz_SSE0,ty_SSE2));
        sum_SSE03 = _mm_add_ps(gri_SSE03,_mm_mul_ps(vx_tz_SSE0,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE04 = _mm_add_ps(gri_SSE04,_mm_mul_ps(vx_tz_SSE0,ty_SSE4));
#endif
        sum_SSE10 = _mm_add_ps(gri_SSE10,_mm_mul_ps(vx_tz_SSE1,ty_SSE0));
        sum_SSE11 = _mm_add_ps(gri_SSE11,_mm_mul_ps(vx_tz_SSE1,ty_SSE1));
        sum_SSE12 = _mm_add_ps(gri_SSE12,_mm_mul_ps(vx_tz_SSE1,ty_SSE2));
        sum_SSE13 = _mm_add_ps(gri_SSE13,_mm_mul_ps(vx_tz_SSE1,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE14 = _mm_add_ps(gri_SSE14,_mm_mul_ps(vx_tz_SSE1,ty_SSE4));
#endif

        _mm_store_ps(grid+index+0*pnz,sum_SSE00);
        _mm_store_ps(grid+index+1*pnz,sum_SSE01);
        _mm_store_ps(grid+index+2*pnz,sum_SSE02);
        _mm_store_ps(grid+index+3*pnz,sum_SSE03);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz,sum_SSE04);
#endif
        _mm_store_ps(grid+index+0*pnz+4,sum_SSE10);
        _mm_store_ps(grid+index+1*pnz+4,sum_SSE11);
        _mm_store_ps(grid+index+2*pnz+4,sum_SSE12);
        _mm_store_ps(grid+index+3*pnz+4,sum_SSE13);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz+4,sum_SSE14);
#endif
    }
}
#undef PME_ORDER
#undef PME_SPREAD_SSE_ALIGNED
#endif


#ifdef PME_GATHER_F_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
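/* The same alignment trick as in the spread kernel above: k0 is rounded
 * down to a multiple of 4, and the z weights thz and their derivatives
 * dthz are shifted and masked, so the out-of-stencil lanes contribute
 * exactly zero to the force accumulation.
 */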
{
    int   offset;

    float fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE0,tz_SSE1;
    __m128 dx_SSE,dy_SSE,dz_SSE0,dz_SSE1;

    __m128 gval_SSE0;
    __m128 gval_SSE1;

    __m128 fxy1_SSE0;
    __m128 fz1_SSE0;
    __m128 fxy1_SSE1;
    __m128 fz1_SSE1;
    __m128 fxy1_SSE;
    __m128 fz1_SSE;

    offset = k0 & 3;

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    /* Shift and mask the z weights and derivatives, as in the spread code */
    tz_SSE0 = _mm_loadu_ps(thz-offset);
    dz_SSE0 = _mm_loadu_ps(dthz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    dz_SSE1 = _mm_loadu_ps(dthz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    dz_SSE0 = _mm_and_ps(dz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);
    dz_SSE1 = _mm_and_ps(dz_SSE1,work->mask_SSE1[offset]);

    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index_x  = (i0+ithx)*pny*pnz;
        tx_SSE   = _mm_load1_ps(thx+ithx);
        dx_SSE   = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<PME_ORDER); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            /* Aligned loads of the two 4-element z chunks */
            gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset);
            gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4);

            fxy1_SSE0 = _mm_mul_ps(tz_SSE0,gval_SSE0);
            fz1_SSE0  = _mm_mul_ps(dz_SSE0,gval_SSE0);
            fxy1_SSE1 = _mm_mul_ps(tz_SSE1,gval_SSE1);
            fz1_SSE1  = _mm_mul_ps(dz_SSE1,gval_SSE1);

            fxy1_SSE = _mm_add_ps(fxy1_SSE0,fxy1_SSE1);
            fz1_SSE  = _mm_add_ps(fz1_SSE0,fz1_SSE1);

            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }

    /* fx_tmp etc. are not guaranteed 16-byte aligned, so use unaligned
     * stores for the horizontal reduction of the four SSE lanes.
     */
    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_ORDER
#undef PME_GATHER_F_SSE_ALIGNED
#endif