/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* This include file has code between #ifdef's to make sure
 * that this performance-sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
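
/* A sketch of the intended usage from the including .c file (the guard
 * names are the real ones below; the file name, the surrounding
 * spread/gather function and its local variables are assumptions):
 *
 *     #define PME_ORDER 4
 *     #define PME_SPREAD_SIMD4_ALIGNED
 *     #include "pme_simd4.h"
 *
 * Each block below #undef's its own guard, and with PME_ORDER fixed the
 * compiler sees constant loop bounds it can fully unroll.
 */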
#ifdef PME_SPREAD_SIMD4_ORDER4
/* Spread one charge with pme_order=4 with unaligned SIMD4 load+store.
 * This code does not assume any memory alignment for the grid.
 */
{
    gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3;
    gmx_simd4_pr tz_S;
    gmx_simd4_pr vx_S;
    gmx_simd4_pr vx_tz_S;
    gmx_simd4_pr sum_S0, sum_S1, sum_S2, sum_S3;
    gmx_simd4_pr gri_S0, gri_S1, gri_S2, gri_S3;

    ty_S0 = gmx_simd4_set1_pr(thy[0]);
    ty_S1 = gmx_simd4_set1_pr(thy[1]);
    ty_S2 = gmx_simd4_set1_pr(thy[2]);
    ty_S3 = gmx_simd4_set1_pr(thy[3]);

    /* With order 4 the z-spline is actually aligned */
    tz_S = gmx_simd4_load_pr(thz);
    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        valx    = qn*thx[ithx];  /* qn: charge weight from the including code */

        vx_S    = gmx_simd4_set1_pr(valx);

        vx_tz_S = gmx_simd4_mul_pr(vx_S, tz_S);
        gri_S0 = gmx_simd4_loadu_pr(grid+index_x+(j0+0)*pnz+k0);
        gri_S1 = gmx_simd4_loadu_pr(grid+index_x+(j0+1)*pnz+k0);
        gri_S2 = gmx_simd4_loadu_pr(grid+index_x+(j0+2)*pnz+k0);
        gri_S3 = gmx_simd4_loadu_pr(grid+index_x+(j0+3)*pnz+k0);

        sum_S0 = gmx_simd4_madd_pr(vx_tz_S, ty_S0, gri_S0);
        sum_S1 = gmx_simd4_madd_pr(vx_tz_S, ty_S1, gri_S1);
        sum_S2 = gmx_simd4_madd_pr(vx_tz_S, ty_S2, gri_S2);
        sum_S3 = gmx_simd4_madd_pr(vx_tz_S, ty_S3, gri_S3);

        gmx_simd4_storeu_pr(grid+index_x+(j0+0)*pnz+k0, sum_S0);
        gmx_simd4_storeu_pr(grid+index_x+(j0+1)*pnz+k0, sum_S1);
        gmx_simd4_storeu_pr(grid+index_x+(j0+2)*pnz+k0, sum_S2);
        gmx_simd4_storeu_pr(grid+index_x+(j0+3)*pnz+k0, sum_S3);
    }
}
#undef PME_SPREAD_SIMD4_ORDER4
#endif
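
/* Scalar reference for the spread block above (a sketch; uses the same
 * surrounding variables, with qn assumed to be the charge prefactor):
 *
 *     for (ithx = 0; ithx < 4; ithx++)
 *     {
 *         valx = qn*thx[ithx];
 *         for (ithy = 0; ithy < 4; ithy++)
 *         {
 *             for (ithz = 0; ithz < 4; ithz++)
 *             {
 *                 grid[(i0+ithx)*pny*pnz + (j0+ithy)*pnz + k0 + ithz] +=
 *                     valx*thy[ithy]*thz[ithz];
 *             }
 *         }
 *     }
 *
 * The SIMD4 version computes the four ithz values of one (ithx,ithy) row
 * in a single fused multiply-add.
 */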
#ifdef PME_GATHER_F_SIMD4_ORDER4
/* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store.
 * This code does not assume any memory alignment for the grid.
 */
{
    real         fx_tmp[4], fy_tmp[4], fz_tmp[4];

    gmx_simd4_pr fx_S, fy_S, fz_S;

    gmx_simd4_pr tx_S, ty_S, tz_S;
    gmx_simd4_pr dx_S, dy_S, dz_S;

    gmx_simd4_pr gval_S;

    gmx_simd4_pr fxy1_S;
    gmx_simd4_pr fz1_S;

    fx_S = gmx_simd4_setzero_pr();
    fy_S = gmx_simd4_setzero_pr();
    fz_S = gmx_simd4_setzero_pr();

    /* With order 4 the z-spline is actually aligned */
    tz_S = gmx_simd4_load_pr(thz);
    dz_S = gmx_simd4_load_pr(dthz);
    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_S    = gmx_simd4_set1_pr(thx[ithx]);
        dx_S    = gmx_simd4_set1_pr(dthx[ithx]);

        for (ithy = 0; (ithy < 4); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_S     = gmx_simd4_set1_pr(thy[ithy]);
            dy_S     = gmx_simd4_set1_pr(dthy[ithy]);
            gval_S = gmx_simd4_loadu_pr(grid+index_xy+k0);

            fxy1_S = gmx_simd4_mul_pr(tz_S, gval_S);
            fz1_S  = gmx_simd4_mul_pr(dz_S, gval_S);

            fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S);
            fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S);
            fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S);
        }
    }

    gmx_simd4_storeu_pr(fx_tmp, fx_S);
    gmx_simd4_storeu_pr(fy_tmp, fy_S);
    gmx_simd4_storeu_pr(fz_tmp, fz_S);

    /* Horizontal reduction of the four lane partial sums */
    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SIMD4_ORDER4
#endif
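
/* Scalar reference for the gather block above (a sketch; same surrounding
 * variables, with gval introduced here for illustration only):
 *
 *     for (ithx = 0; ithx < 4; ithx++)
 *     {
 *         for (ithy = 0; ithy < 4; ithy++)
 *         {
 *             for (ithz = 0; ithz < 4; ithz++)
 *             {
 *                 gval = grid[(i0+ithx)*pny*pnz + (j0+ithy)*pnz + k0 + ithz];
 *                 fx  += dthx[ithx]* thy[ithy]* thz[ithz]*gval;
 *                 fy  +=  thx[ithx]*dthy[ithy]* thz[ithz]*gval;
 *                 fz  +=  thx[ithx]* thy[ithy]*dthz[ithz]*gval;
 *             }
 *         }
 *     }
 */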
#ifdef PME_SPREAD_SIMD4_ALIGNED
/* This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
{
    int          offset;
    int          index;
    gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3, ty_S4;
    gmx_simd4_pr tz_S0, tz_S1;
    gmx_simd4_pr vx_S;
    gmx_simd4_pr vx_tz_S0;
    gmx_simd4_pr vx_tz_S1;
    gmx_simd4_pr sum_S00, sum_S01, sum_S02, sum_S03, sum_S04;
    gmx_simd4_pr sum_S10, sum_S11, sum_S12, sum_S13, sum_S14;
    gmx_simd4_pr gri_S00, gri_S01, gri_S02, gri_S03, gri_S04;
    gmx_simd4_pr gri_S10, gri_S11, gri_S12, gri_S13, gri_S14;

    /* Round k0 down to a multiple of 4 for aligned loads; offset in [0,3] */
    offset = k0 & 3;

    ty_S0 = gmx_simd4_set1_pr(thy[0]);
    ty_S1 = gmx_simd4_set1_pr(thy[1]);
    ty_S2 = gmx_simd4_set1_pr(thy[2]);
    ty_S3 = gmx_simd4_set1_pr(thy[3]);
#if PME_ORDER == 5
    ty_S4 = gmx_simd4_set1_pr(thy[4]);
#endif
#ifdef GMX_SIMD4_HAVE_UNALIGNED
    tz_S0 = gmx_simd4_loadu_pr(thz-offset);
    tz_S1 = gmx_simd4_loadu_pr(thz-offset+4);
#else
    {
        int i;
        /* Copy thz to an aligned buffer (unused buffer parts are masked) */
        for (i = 0; i < PME_ORDER; i++)
        {
            thz_aligned[offset+i] = thz[i];
        }
        tz_S0 = gmx_simd4_load_pr(thz_aligned);
        tz_S1 = gmx_simd4_load_pr(thz_aligned+4);
    }
#endif
    tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]);
    tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]);
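
    /* Worked example of the masking above (numbers illustrative only):
     * with PME_ORDER = 5 and k0 = 6, offset = k0 & 3 = 2, and the eight
     * lanes of tz_S0:tz_S1 cover grid z-indices 4..11.  mask_S0[2] zeroes
     * the first two lanes (z = 4,5) and mask_S1[2] the last lane (z = 11),
     * leaving thz[0..4] in the lanes for z = 6..10.  A zeroed spline
     * weight turns the madd below into a plain reload, so those grid
     * cells are stored back unchanged. */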
    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        valx  = qn*thx[ithx];  /* qn: charge weight from the including code */

        vx_S  = gmx_simd4_set1_pr(valx);

        vx_tz_S0 = gmx_simd4_mul_pr(vx_S, tz_S0);
        vx_tz_S1 = gmx_simd4_mul_pr(vx_S, tz_S1);
        gri_S00 = gmx_simd4_load_pr(grid+index+0*pnz);
        gri_S01 = gmx_simd4_load_pr(grid+index+1*pnz);
        gri_S02 = gmx_simd4_load_pr(grid+index+2*pnz);
        gri_S03 = gmx_simd4_load_pr(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_S04 = gmx_simd4_load_pr(grid+index+4*pnz);
#endif
        gri_S10 = gmx_simd4_load_pr(grid+index+0*pnz+4);
        gri_S11 = gmx_simd4_load_pr(grid+index+1*pnz+4);
        gri_S12 = gmx_simd4_load_pr(grid+index+2*pnz+4);
        gri_S13 = gmx_simd4_load_pr(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_S14 = gmx_simd4_load_pr(grid+index+4*pnz+4);
#endif

        sum_S00 = gmx_simd4_madd_pr(vx_tz_S0, ty_S0, gri_S00);
        sum_S01 = gmx_simd4_madd_pr(vx_tz_S0, ty_S1, gri_S01);
        sum_S02 = gmx_simd4_madd_pr(vx_tz_S0, ty_S2, gri_S02);
        sum_S03 = gmx_simd4_madd_pr(vx_tz_S0, ty_S3, gri_S03);
#if PME_ORDER == 5
        sum_S04 = gmx_simd4_madd_pr(vx_tz_S0, ty_S4, gri_S04);
#endif
        sum_S10 = gmx_simd4_madd_pr(vx_tz_S1, ty_S0, gri_S10);
        sum_S11 = gmx_simd4_madd_pr(vx_tz_S1, ty_S1, gri_S11);
        sum_S12 = gmx_simd4_madd_pr(vx_tz_S1, ty_S2, gri_S12);
        sum_S13 = gmx_simd4_madd_pr(vx_tz_S1, ty_S3, gri_S13);
#if PME_ORDER == 5
        sum_S14 = gmx_simd4_madd_pr(vx_tz_S1, ty_S4, gri_S14);
#endif

        gmx_simd4_store_pr(grid+index+0*pnz, sum_S00);
        gmx_simd4_store_pr(grid+index+1*pnz, sum_S01);
        gmx_simd4_store_pr(grid+index+2*pnz, sum_S02);
        gmx_simd4_store_pr(grid+index+3*pnz, sum_S03);
#if PME_ORDER == 5
        gmx_simd4_store_pr(grid+index+4*pnz, sum_S04);
#endif
        gmx_simd4_store_pr(grid+index+0*pnz+4, sum_S10);
        gmx_simd4_store_pr(grid+index+1*pnz+4, sum_S11);
        gmx_simd4_store_pr(grid+index+2*pnz+4, sum_S12);
        gmx_simd4_store_pr(grid+index+3*pnz+4, sum_S13);
#if PME_ORDER == 5
        gmx_simd4_store_pr(grid+index+4*pnz+4, sum_S14);
#endif
    }
}
#undef PME_SPREAD_SIMD4_ALIGNED
#endif
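
/* Note on the alignment preconditions above: the caller is assumed to pad
 * each grid row to a multiple of 4 reals and to allocate the grid 4-real
 * aligned, e.g. something like
 *     pnz = 4*((nz + 3)/4);
 *     snew_aligned(grid, pnx*pny*pnz, 16);  // 16 bytes = 4 single-prec reals
 * (illustrative only; the actual allocation is done in the PME setup code),
 * so that grid+index+n*pnz stays 4-real aligned whenever index is. */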
#ifdef PME_GATHER_F_SIMD4_ALIGNED
/* This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
{
    int          offset;
    real         fx_tmp[4], fy_tmp[4], fz_tmp[4];

    gmx_simd4_pr fx_S, fy_S, fz_S;

    gmx_simd4_pr tx_S, ty_S, tz_S0, tz_S1;
    gmx_simd4_pr dx_S, dy_S, dz_S0, dz_S1;

    gmx_simd4_pr gval_S0;
    gmx_simd4_pr gval_S1;

    gmx_simd4_pr fxy1_S0;
    gmx_simd4_pr fz1_S0;
    gmx_simd4_pr fxy1_S1;
    gmx_simd4_pr fz1_S1;
    gmx_simd4_pr fxy1_S;
    gmx_simd4_pr fz1_S;

    /* Round k0 down to a multiple of 4 for aligned loads; offset in [0,3] */
    offset = k0 & 3;

    fx_S = gmx_simd4_setzero_pr();
    fy_S = gmx_simd4_setzero_pr();
    fz_S = gmx_simd4_setzero_pr();
#ifdef GMX_SIMD4_HAVE_UNALIGNED
    tz_S0 = gmx_simd4_loadu_pr(thz-offset);
    tz_S1 = gmx_simd4_loadu_pr(thz-offset+4);
    dz_S0 = gmx_simd4_loadu_pr(dthz-offset);
    dz_S1 = gmx_simd4_loadu_pr(dthz-offset+4);
#else
    {
        int i;
        /* Copy (d)thz to an aligned buffer (unused buffer parts are masked) */
        for (i = 0; i < PME_ORDER; i++)
        {
            thz_aligned[offset+i]  = thz[i];
            dthz_aligned[offset+i] = dthz[i];
        }
        tz_S0 = gmx_simd4_load_pr(thz_aligned);
        tz_S1 = gmx_simd4_load_pr(thz_aligned+4);
        dz_S0 = gmx_simd4_load_pr(dthz_aligned);
        dz_S1 = gmx_simd4_load_pr(dthz_aligned+4);
    }
#endif
    tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]);
    dz_S0 = gmx_simd4_blendzero_pr(dz_S0, work->mask_S0[offset]);
    tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]);
    dz_S1 = gmx_simd4_blendzero_pr(dz_S1, work->mask_S1[offset]);
    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_S    = gmx_simd4_set1_pr(thx[ithx]);
        dx_S    = gmx_simd4_set1_pr(dthx[ithx]);

        for (ithy = 0; (ithy < PME_ORDER); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_S     = gmx_simd4_set1_pr(thy[ithy]);
            dy_S     = gmx_simd4_set1_pr(dthy[ithy]);
            gval_S0 = gmx_simd4_load_pr(grid+index_xy+k0-offset);
            gval_S1 = gmx_simd4_load_pr(grid+index_xy+k0-offset+4);

            fxy1_S0 = gmx_simd4_mul_pr(tz_S0, gval_S0);
            fz1_S0  = gmx_simd4_mul_pr(dz_S0, gval_S0);
            fxy1_S1 = gmx_simd4_mul_pr(tz_S1, gval_S1);
            fz1_S1  = gmx_simd4_mul_pr(dz_S1, gval_S1);

            fxy1_S = gmx_simd4_add_pr(fxy1_S0, fxy1_S1);
            fz1_S  = gmx_simd4_add_pr(fz1_S0, fz1_S1);

            fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S);
            fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S);
            fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S);
        }
    }

    gmx_simd4_store_pr(fx_tmp, fx_S);
    gmx_simd4_store_pr(fy_tmp, fy_S);
    gmx_simd4_store_pr(fz_tmp, fz_S);

    /* Horizontal reduction of the four lane partial sums */
    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SIMD4_ALIGNED
#endif