/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/* This include file has code between ifdef's to make sure
 * that this performance sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
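
/* A caller selects one kernel by defining the corresponding macro before
 * including this file, e.g. (a sketch of the pattern; the actual call
 * sites live in the PME spread/gather routines, where i0, j0, k0, the
 * spline arrays and the grid pointer are in scope):
 *
 *     #define PME_SPREAD_SIMD4_ORDER4
 *     #include "pme_simd4.h"
 *
 * Each block below #undef's its selector macro, so the file can be
 * included several times with different kernels chosen.
 */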

#ifdef PME_SPREAD_SIMD4_ORDER4
/* Spread one charge with pme_order=4 with unaligned SIMD4 load+store.
 * This code does not assume any memory alignment for the grid.
 */
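/* In scalar form, the block below is equivalent to this loop nest, which
 * adds one charge, weighted by the B-spline coefficients thx/thy/thz,
 * to the 4x4x4 grid region starting at (i0, j0, k0) (a sketch, assuming
 * the same surrounding scope as the SIMD code, with qn the charge):
 *
 *     for (ithx = 0; ithx < 4; ithx++)
 *         for (ithy = 0; ithy < 4; ithy++)
 *             for (ithz = 0; ithz < 4; ithz++)
 *                 grid[(i0+ithx)*pny*pnz + (j0+ithy)*pnz + (k0+ithz)] +=
 *                     qn*thx[ithx]*thy[ithy]*thz[ithz];
 *
 * The SIMD4 version handles the four z-points of one (x,y) line in a
 * single vector, so only the x loop remains, with y unrolled four-fold.
 */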
{
    gmx_simd4_real_t ty_S0, ty_S1, ty_S2, ty_S3;
    gmx_simd4_real_t tz_S;
    gmx_simd4_real_t vx_S;
    gmx_simd4_real_t vx_tz_S;
    gmx_simd4_real_t sum_S0, sum_S1, sum_S2, sum_S3;
    gmx_simd4_real_t gri_S0, gri_S1, gri_S2, gri_S3;

    ty_S0 = gmx_simd4_set1_r(thy[0]);
    ty_S1 = gmx_simd4_set1_r(thy[1]);
    ty_S2 = gmx_simd4_set1_r(thy[2]);
    ty_S3 = gmx_simd4_set1_r(thy[3]);

    /* With order 4 the z-spline is actually aligned */
    tz_S  = gmx_simd4_load_r(thz);

    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        /* valx: the charge (qn in the including scope) times the x-spline */
        valx    = qn*thx[ithx];

        vx_S    = gmx_simd4_set1_r(valx);

        vx_tz_S = gmx_simd4_mul_r(vx_S, tz_S);

        gri_S0 = gmx_simd4_loadu_r(grid+index_x+(j0+0)*pnz+k0);
        gri_S1 = gmx_simd4_loadu_r(grid+index_x+(j0+1)*pnz+k0);
        gri_S2 = gmx_simd4_loadu_r(grid+index_x+(j0+2)*pnz+k0);
        gri_S3 = gmx_simd4_loadu_r(grid+index_x+(j0+3)*pnz+k0);

        sum_S0 = gmx_simd4_fmadd_r(vx_tz_S, ty_S0, gri_S0);
        sum_S1 = gmx_simd4_fmadd_r(vx_tz_S, ty_S1, gri_S1);
        sum_S2 = gmx_simd4_fmadd_r(vx_tz_S, ty_S2, gri_S2);
        sum_S3 = gmx_simd4_fmadd_r(vx_tz_S, ty_S3, gri_S3);

        gmx_simd4_storeu_r(grid+index_x+(j0+0)*pnz+k0, sum_S0);
        gmx_simd4_storeu_r(grid+index_x+(j0+1)*pnz+k0, sum_S1);
        gmx_simd4_storeu_r(grid+index_x+(j0+2)*pnz+k0, sum_S2);
        gmx_simd4_storeu_r(grid+index_x+(j0+3)*pnz+k0, sum_S3);
    }
}
#undef PME_SPREAD_SIMD4_ORDER4
#endif

#ifdef PME_GATHER_F_SIMD4_ORDER4
/* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store.
 * This code does not assume any memory alignment for the grid.
 */
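/* In scalar form, the block below accumulates, over the 4x4x4 region,
 *
 *     fx += dthx[ithx]* thy[ithy]* thz[ithz]*gval;
 *     fy +=  thx[ithx]*dthy[ithy]* thz[ithz]*gval;
 *     fz +=  thx[ithx]* thy[ithy]*dthz[ithz]*gval;
 *
 * where gval is the grid value and dthx/dthy/dthz are the B-spline
 * derivatives: for each force component the spline factor along that
 * axis is replaced by its derivative (a sketch of what the SIMD code
 * computes; the caller turns these sums into the actual force).
 */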
{
    real             fx_tmp[4], fy_tmp[4], fz_tmp[4];

    gmx_simd4_real_t fx_S, fy_S, fz_S;

    gmx_simd4_real_t tx_S, ty_S, tz_S;
    gmx_simd4_real_t dx_S, dy_S, dz_S;

    gmx_simd4_real_t gval_S;

    gmx_simd4_real_t fxy1_S;
    gmx_simd4_real_t fz1_S;

    fx_S = gmx_simd4_setzero_r();
    fy_S = gmx_simd4_setzero_r();
    fz_S = gmx_simd4_setzero_r();

    /* With order 4 the z-spline is actually aligned */
    tz_S  = gmx_simd4_load_r(thz);
    dz_S  = gmx_simd4_load_r(dthz);

    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_S    = gmx_simd4_set1_r(thx[ithx]);
        dx_S    = gmx_simd4_set1_r(dthx[ithx]);

        for (ithy = 0; (ithy < 4); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_S     = gmx_simd4_set1_r(thy[ithy]);
            dy_S     = gmx_simd4_set1_r(dthy[ithy]);

            gval_S   = gmx_simd4_loadu_r(grid+index_xy+k0);

            fxy1_S   = gmx_simd4_mul_r(tz_S, gval_S);
            fz1_S    = gmx_simd4_mul_r(dz_S, gval_S);

            fx_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(dx_S, ty_S), fxy1_S, fx_S);
            fy_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, dy_S), fxy1_S, fy_S);
            fz_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, ty_S), fz1_S, fz_S);
        }
    }

    /* Horizontal reduction: store the vector accumulators and sum their lanes */
    gmx_simd4_storeu_r(fx_tmp, fx_S);
    gmx_simd4_storeu_r(fy_tmp, fy_S);
    gmx_simd4_storeu_r(fz_tmp, fz_S);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SIMD4_ORDER4
#endif

#ifdef PME_SPREAD_SIMD4_ALIGNED
/* This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
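/* Since k0 need not be a multiple of 4, the start of the z-line is
 * rounded down to the aligned element k0 - offset, with offset = k0 & 3,
 * and up to 5 spline points are covered by two aligned SIMD4 vectors
 * (8 reals).  The z-spline vectors are zeroed outside the window
 * [offset, offset+PME_ORDER) by the precomputed masks in work->mask_S0
 * and work->mask_S1, so the extra grid elements are loaded and stored
 * unchanged (their increment is exactly zero).
 */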
{
    int              offset;
    int              index;
    gmx_simd4_real_t ty_S0, ty_S1, ty_S2, ty_S3, ty_S4;
    gmx_simd4_real_t tz_S0;
    gmx_simd4_real_t tz_S1;
    gmx_simd4_real_t vx_S;
    gmx_simd4_real_t vx_tz_S0;
    gmx_simd4_real_t vx_tz_S1;
    gmx_simd4_real_t sum_S00, sum_S01, sum_S02, sum_S03, sum_S04;
    gmx_simd4_real_t sum_S10, sum_S11, sum_S12, sum_S13, sum_S14;
    gmx_simd4_real_t gri_S00, gri_S01, gri_S02, gri_S03, gri_S04;
    gmx_simd4_real_t gri_S10, gri_S11, gri_S12, gri_S13, gri_S14;

    /* offset = k0 modulo 4: the position of k0 within its aligned chunk */
    offset = k0 & 3;

    ty_S0 = gmx_simd4_set1_r(thy[0]);
    ty_S1 = gmx_simd4_set1_r(thy[1]);
    ty_S2 = gmx_simd4_set1_r(thy[2]);
    ty_S3 = gmx_simd4_set1_r(thy[3]);
#if PME_ORDER == 5
    ty_S4 = gmx_simd4_set1_r(thy[4]);
#endif

#ifdef GMX_SIMD4_HAVE_UNALIGNED
    tz_S0 = gmx_simd4_loadu_r(thz-offset);
    tz_S1 = gmx_simd4_loadu_r(thz-offset+4);
#else
    {
        int i;
        /* Copy thz to an aligned buffer (unused buffer parts are masked) */
        for (i = 0; i < PME_ORDER; i++)
        {
            thz_aligned[offset+i] = thz[i];
        }
        tz_S0 = gmx_simd4_load_r(thz_aligned);
        tz_S1 = gmx_simd4_load_r(thz_aligned+4);
    }
#endif

    /* Zero the z-spline lanes outside the window [offset, offset+PME_ORDER) */
    tz_S0 = gmx_simd4_blendzero_r(tz_S0, work->mask_S0[offset]);
    tz_S1 = gmx_simd4_blendzero_r(tz_S1, work->mask_S1[offset]);
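    /* For example, with PME_ORDER 5 and offset 1 the window [1, 6) keeps
     * lanes 1-3 of tz_S0 and lanes 0-1 of tz_S1; all other lanes are set
     * to zero (assuming the masks were precomputed with this layout,
     * which is not defined in this file).
     */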

    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        /* valx: the charge (qn in the including scope) times the x-spline */
        valx  = qn*thx[ithx];

        vx_S  = gmx_simd4_set1_r(valx);

        vx_tz_S0 = gmx_simd4_mul_r(vx_S, tz_S0);
        vx_tz_S1 = gmx_simd4_mul_r(vx_S, tz_S1);

        gri_S00 = gmx_simd4_load_r(grid+index+0*pnz);
        gri_S01 = gmx_simd4_load_r(grid+index+1*pnz);
        gri_S02 = gmx_simd4_load_r(grid+index+2*pnz);
        gri_S03 = gmx_simd4_load_r(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_S04 = gmx_simd4_load_r(grid+index+4*pnz);
#endif
        gri_S10 = gmx_simd4_load_r(grid+index+0*pnz+4);
        gri_S11 = gmx_simd4_load_r(grid+index+1*pnz+4);
        gri_S12 = gmx_simd4_load_r(grid+index+2*pnz+4);
        gri_S13 = gmx_simd4_load_r(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_S14 = gmx_simd4_load_r(grid+index+4*pnz+4);
#endif

        sum_S00 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S0, gri_S00);
        sum_S01 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S1, gri_S01);
        sum_S02 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S2, gri_S02);
        sum_S03 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S3, gri_S03);
#if PME_ORDER == 5
        sum_S04 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S4, gri_S04);
#endif
        sum_S10 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S0, gri_S10);
        sum_S11 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S1, gri_S11);
        sum_S12 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S2, gri_S12);
        sum_S13 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S3, gri_S13);
#if PME_ORDER == 5
        sum_S14 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S4, gri_S14);
#endif

        gmx_simd4_store_r(grid+index+0*pnz, sum_S00);
        gmx_simd4_store_r(grid+index+1*pnz, sum_S01);
        gmx_simd4_store_r(grid+index+2*pnz, sum_S02);
        gmx_simd4_store_r(grid+index+3*pnz, sum_S03);
#if PME_ORDER == 5
        gmx_simd4_store_r(grid+index+4*pnz, sum_S04);
#endif
        gmx_simd4_store_r(grid+index+0*pnz+4, sum_S10);
        gmx_simd4_store_r(grid+index+1*pnz+4, sum_S11);
        gmx_simd4_store_r(grid+index+2*pnz+4, sum_S12);
        gmx_simd4_store_r(grid+index+3*pnz+4, sum_S13);
#if PME_ORDER == 5
        gmx_simd4_store_r(grid+index+4*pnz+4, sum_S14);
#endif
    }
}
#undef PME_SPREAD_SIMD4_ALIGNED
#undef PME_ORDER
#endif

#ifdef PME_GATHER_F_SIMD4_ALIGNED
/* This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
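/* As in the aligned spread kernel above, each z-line is processed as two
 * aligned SIMD4 vectors starting at k0 - offset; the masked-out spline
 * lanes contribute exact zeros to the force accumulators, so no scalar
 * tail handling is needed.
 */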
{
    int              offset;

    real             fx_tmp[4], fy_tmp[4], fz_tmp[4];

    gmx_simd4_real_t fx_S, fy_S, fz_S;

    gmx_simd4_real_t tx_S, ty_S, tz_S0, tz_S1;
    gmx_simd4_real_t dx_S, dy_S, dz_S0, dz_S1;

    gmx_simd4_real_t gval_S0;
    gmx_simd4_real_t gval_S1;

    gmx_simd4_real_t fxy1_S0;
    gmx_simd4_real_t fz1_S0;
    gmx_simd4_real_t fxy1_S1;
    gmx_simd4_real_t fz1_S1;
    gmx_simd4_real_t fxy1_S;
    gmx_simd4_real_t fz1_S;

    /* offset = k0 modulo 4: the position of k0 within its aligned chunk */
    offset = k0 & 3;

    fx_S = gmx_simd4_setzero_r();
    fy_S = gmx_simd4_setzero_r();
    fz_S = gmx_simd4_setzero_r();

#ifdef GMX_SIMD4_HAVE_UNALIGNED
    tz_S0 = gmx_simd4_loadu_r(thz-offset);
    tz_S1 = gmx_simd4_loadu_r(thz-offset+4);
    dz_S0 = gmx_simd4_loadu_r(dthz-offset);
    dz_S1 = gmx_simd4_loadu_r(dthz-offset+4);
#else
    {
        int i;
        /* Copy (d)thz to an aligned buffer (unused buffer parts are masked) */
        for (i = 0; i < PME_ORDER; i++)
        {
            thz_aligned[offset+i]  = thz[i];
            dthz_aligned[offset+i] = dthz[i];
        }
        tz_S0 = gmx_simd4_load_r(thz_aligned);
        tz_S1 = gmx_simd4_load_r(thz_aligned+4);
        dz_S0 = gmx_simd4_load_r(dthz_aligned);
        dz_S1 = gmx_simd4_load_r(dthz_aligned+4);
    }
#endif

    /* Zero the (derivative) z-spline lanes outside [offset, offset+PME_ORDER) */
    tz_S0 = gmx_simd4_blendzero_r(tz_S0, work->mask_S0[offset]);
    dz_S0 = gmx_simd4_blendzero_r(dz_S0, work->mask_S0[offset]);
    tz_S1 = gmx_simd4_blendzero_r(tz_S1, work->mask_S1[offset]);
    dz_S1 = gmx_simd4_blendzero_r(dz_S1, work->mask_S1[offset]);

    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_S    = gmx_simd4_set1_r(thx[ithx]);
        dx_S    = gmx_simd4_set1_r(dthx[ithx]);

        for (ithy = 0; (ithy < PME_ORDER); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_S     = gmx_simd4_set1_r(thy[ithy]);
            dy_S     = gmx_simd4_set1_r(dthy[ithy]);

            gval_S0  = gmx_simd4_load_r(grid+index_xy+k0-offset);
            gval_S1  = gmx_simd4_load_r(grid+index_xy+k0-offset+4);

            fxy1_S0  = gmx_simd4_mul_r(tz_S0, gval_S0);
            fz1_S0   = gmx_simd4_mul_r(dz_S0, gval_S0);
            fxy1_S1  = gmx_simd4_mul_r(tz_S1, gval_S1);
            fz1_S1   = gmx_simd4_mul_r(dz_S1, gval_S1);

            fxy1_S   = gmx_simd4_add_r(fxy1_S0, fxy1_S1);
            fz1_S    = gmx_simd4_add_r(fz1_S0, fz1_S1);

            fx_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(dx_S, ty_S), fxy1_S, fx_S);
            fy_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, dy_S), fxy1_S, fy_S);
            fz_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, ty_S), fz1_S, fz_S);
        }
    }

    /* Horizontal reduction: store the vector accumulators and sum their lanes */
    gmx_simd4_store_r(fx_tmp, fx_S);
    gmx_simd4_store_r(fy_tmp, fy_S);
    gmx_simd4_store_r(fz_tmp, fz_S);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SIMD4_ALIGNED
#undef PME_ORDER
#endif