/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 * Copyright (c) 2012, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* This include file has code between ifdef's to make sure
 * that this performance sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
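/* Illustrative sketch of how the including source file is expected to use
 * these fragments (the exact caller code is not part of this file): define
 * one of the section macros, then include this header inside the spread or
 * gather routine, e.g.
 *
 *   #define PME_SPREAD_SSE_ORDER4
 *   #include "pme_sse_single.h"
 *
 * The fragment is then compiled inline with fixed loop bounds, and each
 * section #undef's its own macro when done.
 */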
#ifdef PME_SPREAD_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
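/* What the vectorized loop below computes, written out in scalar form
 * (a sketch; qn (the charge being spread), thx/thy/thz, i0/j0/k0, pny/pnz
 * and the loop indices are provided by the including scope):
 *
 *   for (ithx = 0; ithx < 4; ithx++)
 *       for (ithy = 0; ithy < 4; ithy++)
 *           for (ithz = 0; ithz < 4; ithz++)
 *               grid[(i0+ithx)*pny*pnz + (j0+ithy)*pnz + k0 + ithz] +=
 *                   qn*thx[ithx]*thy[ithy]*thz[ithz];
 *
 * One SSE vector holds the four z grid points of a single (x,y) line.
 */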
{
    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3;
    __m128 tz_SSE;
    __m128 vx_SSE;
    __m128 vx_tz_SSE;
    __m128 sum_SSE0,sum_SSE1,sum_SSE2,sum_SSE3;
    __m128 gri_SSE0,gri_SSE1,gri_SSE2,gri_SSE3;

    /* Broadcast the four y spline weights. */
    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);

    /* The four z spline weights fit exactly in one vector. */
    tz_SSE  = _mm_loadu_ps(thz);

    for(ithx=0; (ithx<4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        valx    = qn*thx[ithx];

        vx_SSE    = _mm_load1_ps(&valx);

        vx_tz_SSE = _mm_mul_ps(vx_SSE,tz_SSE);

        gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0);
        gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0);
        gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0);
        gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0);

        /* Accumulate charge*thx*thy*thz into the four z points of each y line. */
        sum_SSE0 = _mm_add_ps(gri_SSE0,_mm_mul_ps(vx_tz_SSE,ty_SSE0));
        sum_SSE1 = _mm_add_ps(gri_SSE1,_mm_mul_ps(vx_tz_SSE,ty_SSE1));
        sum_SSE2 = _mm_add_ps(gri_SSE2,_mm_mul_ps(vx_tz_SSE,ty_SSE2));
        sum_SSE3 = _mm_add_ps(gri_SSE3,_mm_mul_ps(vx_tz_SSE,ty_SSE3));

        _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0,sum_SSE0);
        _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0,sum_SSE1);
        _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0,sum_SSE2);
        _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0,sum_SSE3);
    }
}
#undef PME_SPREAD_SSE_ORDER4
#endif

#ifdef PME_GATHER_F_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
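/* What the loops below compute, in scalar form (a sketch; the spline
 * derivatives dthx/dthy/dthz and the force accumulators fx/fy/fz live in
 * the including scope):
 *
 *   fx += dthx[ithx]* thy[ithy]* thz[ithz]*grid[...];
 *   fy +=  thx[ithx]*dthy[ithy]* thz[ithz]*grid[...];
 *   fz +=  thx[ithx]* thy[ithy]*dthz[ithz]*grid[...];
 *
 * summed over the 4x4x4 spline region; the four z points are handled in one
 * SSE vector and reduced to scalars at the end.
 */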
{
    float  fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE;
    __m128 dx_SSE,dy_SSE,dz_SSE;

    __m128 gval_SSE;

    __m128 fxy1_SSE;
    __m128 fz1_SSE;

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    tz_SSE = _mm_loadu_ps(thz);
    dz_SSE = _mm_loadu_ps(dthz);

    for(ithx=0; (ithx<4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_SSE  = _mm_load1_ps(thx+ithx);
        dx_SSE  = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<4); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            gval_SSE = _mm_loadu_ps(grid+index_xy+k0);

            fxy1_SSE = _mm_mul_ps(tz_SSE,gval_SSE);
            fz1_SSE  = _mm_mul_ps(dz_SSE,gval_SSE);

            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }

    /* Reduce the per-lane sums to scalar force components. */
    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SSE_ORDER4
#endif

#ifdef PME_SPREAD_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
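/* Alignment trick used below: k0 is rounded down to a multiple of 4
 * (offset = k0 & 3), so all grid accesses along z can use aligned 16-byte
 * loads and stores. The z spline weights are read with the same shift, and
 * the masks in work->mask_SSE0/1 zero the lanes that fall outside the
 * pme_order-point spline support, so the extra grid points contribute
 * nothing. With pme_order <= 5 and offset <= 3, two 4-float vectors cover
 * the whole shifted z range.
 */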
{
    int    offset;

    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3,ty_SSE4;
    __m128 tz_SSE0;
    __m128 tz_SSE1;
    __m128 vx_SSE;
    __m128 vx_tz_SSE0;
    __m128 vx_tz_SSE1;
    __m128 sum_SSE00,sum_SSE01,sum_SSE02,sum_SSE03,sum_SSE04;
    __m128 sum_SSE10,sum_SSE11,sum_SSE12,sum_SSE13,sum_SSE14;
    __m128 gri_SSE00,gri_SSE01,gri_SSE02,gri_SSE03,gri_SSE04;
    __m128 gri_SSE10,gri_SSE11,gri_SSE12,gri_SSE13,gri_SSE14;

    offset = k0 & 3;

    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);
#if PME_ORDER == 5
    ty_SSE4 = _mm_load1_ps(&thy[4]);
#endif

    /* Shifted, masked z spline weights. */
    tz_SSE0 = _mm_loadu_ps(thz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);

    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        valx  = qn*thx[ithx];

        vx_SSE     = _mm_load1_ps(&valx);

        vx_tz_SSE0 = _mm_mul_ps(vx_SSE,tz_SSE0);
        vx_tz_SSE1 = _mm_mul_ps(vx_SSE,tz_SSE1);

        gri_SSE00 = _mm_load_ps(grid+index+0*pnz);
        gri_SSE01 = _mm_load_ps(grid+index+1*pnz);
        gri_SSE02 = _mm_load_ps(grid+index+2*pnz);
        gri_SSE03 = _mm_load_ps(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_SSE04 = _mm_load_ps(grid+index+4*pnz);
#endif
        gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4);
        gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4);
        gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4);
        gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4);
#endif

        sum_SSE00 = _mm_add_ps(gri_SSE00,_mm_mul_ps(vx_tz_SSE0,ty_SSE0));
        sum_SSE01 = _mm_add_ps(gri_SSE01,_mm_mul_ps(vx_tz_SSE0,ty_SSE1));
        sum_SSE02 = _mm_add_ps(gri_SSE02,_mm_mul_ps(vx_tz_SSE0,ty_SSE2));
        sum_SSE03 = _mm_add_ps(gri_SSE03,_mm_mul_ps(vx_tz_SSE0,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE04 = _mm_add_ps(gri_SSE04,_mm_mul_ps(vx_tz_SSE0,ty_SSE4));
#endif
        sum_SSE10 = _mm_add_ps(gri_SSE10,_mm_mul_ps(vx_tz_SSE1,ty_SSE0));
        sum_SSE11 = _mm_add_ps(gri_SSE11,_mm_mul_ps(vx_tz_SSE1,ty_SSE1));
        sum_SSE12 = _mm_add_ps(gri_SSE12,_mm_mul_ps(vx_tz_SSE1,ty_SSE2));
        sum_SSE13 = _mm_add_ps(gri_SSE13,_mm_mul_ps(vx_tz_SSE1,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE14 = _mm_add_ps(gri_SSE14,_mm_mul_ps(vx_tz_SSE1,ty_SSE4));
#endif

        _mm_store_ps(grid+index+0*pnz,sum_SSE00);
        _mm_store_ps(grid+index+1*pnz,sum_SSE01);
        _mm_store_ps(grid+index+2*pnz,sum_SSE02);
        _mm_store_ps(grid+index+3*pnz,sum_SSE03);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz,sum_SSE04);
#endif
        _mm_store_ps(grid+index+0*pnz+4,sum_SSE10);
        _mm_store_ps(grid+index+1*pnz+4,sum_SSE11);
        _mm_store_ps(grid+index+2*pnz+4,sum_SSE12);
        _mm_store_ps(grid+index+3*pnz+4,sum_SSE13);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz+4,sum_SSE14);
#endif
    }
}
#undef PME_SPREAD_SSE_ALIGNED
#endif

#ifdef PME_GATHER_F_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
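/* The up to pme_order (<= 5) z grid points of each (x,y) line are read with
 * two aligned 4-float loads starting at the rounded-down k index; the masked
 * z weights zero the lanes outside the spline support. The two partial
 * products are combined into fxy1_SSE/fz1_SSE before the force accumulation,
 * and the vector accumulators are reduced to scalars at the end.
 */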
{
    int    offset;

    float  fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE0,tz_SSE1;
    __m128 dx_SSE,dy_SSE,dz_SSE0,dz_SSE1;

    __m128 gval_SSE0;
    __m128 gval_SSE1;

    __m128 fxy1_SSE0;
    __m128 fz1_SSE0;
    __m128 fxy1_SSE1;
    __m128 fz1_SSE1;
    __m128 fxy1_SSE;
    __m128 fz1_SSE;

    offset = k0 & 3;

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    /* Shifted, masked z spline weights and derivatives. */
    tz_SSE0 = _mm_loadu_ps(thz-offset);
    dz_SSE0 = _mm_loadu_ps(dthz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    dz_SSE1 = _mm_loadu_ps(dthz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    dz_SSE0 = _mm_and_ps(dz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);
    dz_SSE1 = _mm_and_ps(dz_SSE1,work->mask_SSE1[offset]);

    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_SSE  = _mm_load1_ps(thx+ithx);
        dx_SSE  = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<PME_ORDER); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset);
            gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4);

            fxy1_SSE0 = _mm_mul_ps(tz_SSE0,gval_SSE0);
            fz1_SSE0  = _mm_mul_ps(dz_SSE0,gval_SSE0);
            fxy1_SSE1 = _mm_mul_ps(tz_SSE1,gval_SSE1);
            fz1_SSE1  = _mm_mul_ps(dz_SSE1,gval_SSE1);

            fxy1_SSE = _mm_add_ps(fxy1_SSE0,fxy1_SSE1);
            fz1_SSE  = _mm_add_ps(fz1_SSE0,fz1_SSE1);

            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }

    /* Reduce the per-lane sums to scalar force components; the temporary
     * arrays are not guaranteed to be 16-byte aligned, so use unaligned stores.
     */
    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SSE_ALIGNED
#endif