2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
39 /* This include file has code between ifdef's to make sure
40 * that this performance sensitive code is inlined
41 * and to remove conditionals and variable loop bounds at compile time.
44 #ifdef PME_SPREAD_SSE_ORDER4
45 /* This code does not assume any memory alignment.
46 * This code only works for pme_order = 4.
49 __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3;
53 __m128 sum_SSE0, sum_SSE1, sum_SSE2, sum_SSE3;
54 __m128 gri_SSE0, gri_SSE1, gri_SSE2, gri_SSE3;
56 ty_SSE0 = _mm_load1_ps(&thy[0]);
57 ty_SSE1 = _mm_load1_ps(&thy[1]);
58 ty_SSE2 = _mm_load1_ps(&thy[2]);
59 ty_SSE3 = _mm_load1_ps(&thy[3]);
61 tz_SSE = _mm_loadu_ps(thz);
63 for (ithx = 0; (ithx < 4); ithx++)
65 index_x = (i0+ithx)*pny*pnz;
68 vx_SSE = _mm_load1_ps(&valx);
70 vx_tz_SSE = _mm_mul_ps(vx_SSE, tz_SSE);
72 gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0);
73 gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0);
74 gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0);
75 gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0);
77 sum_SSE0 = _mm_add_ps(gri_SSE0, _mm_mul_ps(vx_tz_SSE, ty_SSE0));
78 sum_SSE1 = _mm_add_ps(gri_SSE1, _mm_mul_ps(vx_tz_SSE, ty_SSE1));
79 sum_SSE2 = _mm_add_ps(gri_SSE2, _mm_mul_ps(vx_tz_SSE, ty_SSE2));
80 sum_SSE3 = _mm_add_ps(gri_SSE3, _mm_mul_ps(vx_tz_SSE, ty_SSE3));
82 _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0, sum_SSE0);
83 _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0, sum_SSE1);
84 _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0, sum_SSE2);
85 _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0, sum_SSE3);
88 #undef PME_SPREAD_SSE_ORDER4
92 #ifdef PME_GATHER_F_SSE_ORDER4
93 /* This code does not assume any memory alignment.
94 * This code only works for pme_order = 4.
97 float fx_tmp[4], fy_tmp[4], fz_tmp[4];
99 __m128 fx_SSE, fy_SSE, fz_SSE;
101 __m128 tx_SSE, ty_SSE, tz_SSE;
102 __m128 dx_SSE, dy_SSE, dz_SSE;
109 fx_SSE = _mm_setzero_ps();
110 fy_SSE = _mm_setzero_ps();
111 fz_SSE = _mm_setzero_ps();
113 tz_SSE = _mm_loadu_ps(thz);
114 dz_SSE = _mm_loadu_ps(dthz);
116 for (ithx = 0; (ithx < 4); ithx++)
118 index_x = (i0+ithx)*pny*pnz;
119 tx_SSE = _mm_load1_ps(thx+ithx);
120 dx_SSE = _mm_load1_ps(dthx+ithx);
122 for (ithy = 0; (ithy < 4); ithy++)
124 index_xy = index_x+(j0+ithy)*pnz;
125 ty_SSE = _mm_load1_ps(thy+ithy);
126 dy_SSE = _mm_load1_ps(dthy+ithy);
128 gval_SSE = _mm_loadu_ps(grid+index_xy+k0);
130 fxy1_SSE = _mm_mul_ps(tz_SSE, gval_SSE);
131 fz1_SSE = _mm_mul_ps(dz_SSE, gval_SSE);
133 fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE));
134 fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE));
135 fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE));
139 _mm_storeu_ps(fx_tmp, fx_SSE);
140 _mm_storeu_ps(fy_tmp, fy_SSE);
141 _mm_storeu_ps(fz_tmp, fz_SSE);
143 fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
144 fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
145 fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
147 #undef PME_GATHER_F_SSE_ORDER4
151 #ifdef PME_SPREAD_SSE_ALIGNED
152 /* This code assumes that the grid is allocated 16 bit aligned
153 * and that pnz is a multiple of 4.
154 * This code supports pme_order <= 5.
159 __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3, ty_SSE4;
165 __m128 sum_SSE00, sum_SSE01, sum_SSE02, sum_SSE03, sum_SSE04;
166 __m128 sum_SSE10, sum_SSE11, sum_SSE12, sum_SSE13, sum_SSE14;
167 __m128 gri_SSE00, gri_SSE01, gri_SSE02, gri_SSE03, gri_SSE04;
168 __m128 gri_SSE10, gri_SSE11, gri_SSE12, gri_SSE13, gri_SSE14;
172 ty_SSE0 = _mm_load1_ps(&thy[0]);
173 ty_SSE1 = _mm_load1_ps(&thy[1]);
174 ty_SSE2 = _mm_load1_ps(&thy[2]);
175 ty_SSE3 = _mm_load1_ps(&thy[3]);
177 ty_SSE4 = _mm_load1_ps(&thy[4]);
180 tz_SSE0 = _mm_loadu_ps(thz-offset);
181 tz_SSE1 = _mm_loadu_ps(thz-offset+4);
182 tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]);
183 tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]);
185 for (ithx = 0; (ithx < PME_ORDER); ithx++)
187 index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
190 vx_SSE = _mm_load1_ps(&valx);
192 vx_tz_SSE0 = _mm_mul_ps(vx_SSE, tz_SSE0);
193 vx_tz_SSE1 = _mm_mul_ps(vx_SSE, tz_SSE1);
195 gri_SSE00 = _mm_load_ps(grid+index+0*pnz);
196 gri_SSE01 = _mm_load_ps(grid+index+1*pnz);
197 gri_SSE02 = _mm_load_ps(grid+index+2*pnz);
198 gri_SSE03 = _mm_load_ps(grid+index+3*pnz);
200 gri_SSE04 = _mm_load_ps(grid+index+4*pnz);
202 gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4);
203 gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4);
204 gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4);
205 gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4);
207 gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4);
210 sum_SSE00 = _mm_add_ps(gri_SSE00, _mm_mul_ps(vx_tz_SSE0, ty_SSE0));
211 sum_SSE01 = _mm_add_ps(gri_SSE01, _mm_mul_ps(vx_tz_SSE0, ty_SSE1));
212 sum_SSE02 = _mm_add_ps(gri_SSE02, _mm_mul_ps(vx_tz_SSE0, ty_SSE2));
213 sum_SSE03 = _mm_add_ps(gri_SSE03, _mm_mul_ps(vx_tz_SSE0, ty_SSE3));
215 sum_SSE04 = _mm_add_ps(gri_SSE04, _mm_mul_ps(vx_tz_SSE0, ty_SSE4));
217 sum_SSE10 = _mm_add_ps(gri_SSE10, _mm_mul_ps(vx_tz_SSE1, ty_SSE0));
218 sum_SSE11 = _mm_add_ps(gri_SSE11, _mm_mul_ps(vx_tz_SSE1, ty_SSE1));
219 sum_SSE12 = _mm_add_ps(gri_SSE12, _mm_mul_ps(vx_tz_SSE1, ty_SSE2));
220 sum_SSE13 = _mm_add_ps(gri_SSE13, _mm_mul_ps(vx_tz_SSE1, ty_SSE3));
222 sum_SSE14 = _mm_add_ps(gri_SSE14, _mm_mul_ps(vx_tz_SSE1, ty_SSE4));
225 _mm_store_ps(grid+index+0*pnz, sum_SSE00);
226 _mm_store_ps(grid+index+1*pnz, sum_SSE01);
227 _mm_store_ps(grid+index+2*pnz, sum_SSE02);
228 _mm_store_ps(grid+index+3*pnz, sum_SSE03);
230 _mm_store_ps(grid+index+4*pnz, sum_SSE04);
232 _mm_store_ps(grid+index+0*pnz+4, sum_SSE10);
233 _mm_store_ps(grid+index+1*pnz+4, sum_SSE11);
234 _mm_store_ps(grid+index+2*pnz+4, sum_SSE12);
235 _mm_store_ps(grid+index+3*pnz+4, sum_SSE13);
237 _mm_store_ps(grid+index+4*pnz+4, sum_SSE14);
242 #undef PME_SPREAD_SSE_ALIGNED
246 #ifdef PME_GATHER_F_SSE_ALIGNED
247 /* This code assumes that the grid is allocated 16 bit aligned
248 * and that pnz is a multiple of 4.
249 * This code supports pme_order <= 5.
254 float fx_tmp[4], fy_tmp[4], fz_tmp[4];
256 __m128 fx_SSE, fy_SSE, fz_SSE;
258 __m128 tx_SSE, ty_SSE, tz_SSE0, tz_SSE1;
259 __m128 dx_SSE, dy_SSE, dz_SSE0, dz_SSE1;
273 fx_SSE = _mm_setzero_ps();
274 fy_SSE = _mm_setzero_ps();
275 fz_SSE = _mm_setzero_ps();
277 tz_SSE0 = _mm_loadu_ps(thz-offset);
278 dz_SSE0 = _mm_loadu_ps(dthz-offset);
279 tz_SSE1 = _mm_loadu_ps(thz-offset+4);
280 dz_SSE1 = _mm_loadu_ps(dthz-offset+4);
281 tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]);
282 dz_SSE0 = _mm_and_ps(dz_SSE0, work->mask_SSE0[offset]);
283 tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]);
284 dz_SSE1 = _mm_and_ps(dz_SSE1, work->mask_SSE1[offset]);
286 for (ithx = 0; (ithx < PME_ORDER); ithx++)
288 index_x = (i0+ithx)*pny*pnz;
289 tx_SSE = _mm_load1_ps(thx+ithx);
290 dx_SSE = _mm_load1_ps(dthx+ithx);
292 for (ithy = 0; (ithy < PME_ORDER); ithy++)
294 index_xy = index_x+(j0+ithy)*pnz;
295 ty_SSE = _mm_load1_ps(thy+ithy);
296 dy_SSE = _mm_load1_ps(dthy+ithy);
298 gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset);
299 gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4);
301 fxy1_SSE0 = _mm_mul_ps(tz_SSE0, gval_SSE0);
302 fz1_SSE0 = _mm_mul_ps(dz_SSE0, gval_SSE0);
303 fxy1_SSE1 = _mm_mul_ps(tz_SSE1, gval_SSE1);
304 fz1_SSE1 = _mm_mul_ps(dz_SSE1, gval_SSE1);
306 fxy1_SSE = _mm_add_ps(fxy1_SSE0, fxy1_SSE1);
307 fz1_SSE = _mm_add_ps(fz1_SSE0, fz1_SSE1);
309 fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE));
310 fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE));
311 fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE));
315 _mm_store_ps(fx_tmp, fx_SSE);
316 _mm_store_ps(fy_tmp, fy_SSE);
317 _mm_store_ps(fz_tmp, fz_SSE);
319 fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
320 fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
321 fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
324 #undef PME_GATHER_F_SSE_ALIGNED