/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *                This source code is part of
 *
 *          GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * GROwing Monsters And Cloning Shrimps
 */

/* This include file has code between ifdef's to make sure
 * that this performance-sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
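
/* A minimal usage sketch (not part of the original; the file name and
 * the surrounding code are assumptions): the including source declares
 * the variables used below (grid, thx/thy/thz, i0/j0/k0, pny/pnz, ...),
 * defines the macro for the variant it needs, and then includes this
 * file, so the preprocessor emits a fully specialized block with fixed
 * loop bounds. The #undef at the end of each block allows several
 * variants to be instantiated one after another:
 *
 *     #define PME_ORDER 5
 *     #define PME_SPREAD_SSE_ALIGNED
 *     #include "pme_sse_single.h"      (file name is an assumption)
 */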

#ifdef PME_SPREAD_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
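/* For reference, a scalar sketch of what the SSE block below computes,
 * using the same surrounding variables; qn*thx[ithx] is folded into
 * valx in the vector code, the four y values are handled by the four
 * broadcast ty registers, and the z loop becomes one 4-wide vector:
 *
 *     for (ithx = 0; ithx < 4; ithx++)
 *         for (ithy = 0; ithy < 4; ithy++)
 *             for (ithz = 0; ithz < 4; ithz++)
 *                 grid[(i0+ithx)*pny*pnz + (j0+ithy)*pnz + k0 + ithz] +=
 *                     qn*thx[ithx]*thy[ithy]*thz[ithz];
 */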
{
    __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3;
    __m128 tz_SSE, vx_SSE, vx_tz_SSE;
    __m128 sum_SSE0, sum_SSE1, sum_SSE2, sum_SSE3;
    __m128 gri_SSE0, gri_SSE1, gri_SSE2, gri_SSE3;

    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);

    tz_SSE = _mm_loadu_ps(thz);

    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        valx    = qn*thx[ithx]; /* restored: qn is the particle charge (see the sketch above) */

        vx_SSE = _mm_load1_ps(&valx);

        vx_tz_SSE = _mm_mul_ps(vx_SSE, tz_SSE);

        gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0);
        gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0);
        gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0);
        gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0);

        sum_SSE0 = _mm_add_ps(gri_SSE0, _mm_mul_ps(vx_tz_SSE, ty_SSE0));
        sum_SSE1 = _mm_add_ps(gri_SSE1, _mm_mul_ps(vx_tz_SSE, ty_SSE1));
        sum_SSE2 = _mm_add_ps(gri_SSE2, _mm_mul_ps(vx_tz_SSE, ty_SSE2));
        sum_SSE3 = _mm_add_ps(gri_SSE3, _mm_mul_ps(vx_tz_SSE, ty_SSE3));

        _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0, sum_SSE0);
        _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0, sum_SSE1);
        _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0, sum_SSE2);
        _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0, sum_SSE3);
    }
}
#undef PME_SPREAD_SSE_ORDER4
#endif

#ifdef PME_GATHER_F_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
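/* For reference, a scalar sketch of the force gather below: th* are the
 * B-spline values, dth* their derivatives, and each grid value
 * contributes to all three force components:
 *
 *     for (ithx = 0; ithx < 4; ithx++)
 *         for (ithy = 0; ithy < 4; ithy++)
 *             for (ithz = 0; ithz < 4; ithz++)
 *             {
 *                 gval = grid[(i0+ithx)*pny*pnz + (j0+ithy)*pnz + k0 + ithz];
 *                 fx  += dthx[ithx]* thy[ithy]* thz[ithz]*gval;
 *                 fy  +=  thx[ithx]*dthy[ithy]* thz[ithz]*gval;
 *                 fz  +=  thx[ithx]* thy[ithy]*dthz[ithz]*gval;
 *             }
 *
 * In the vector code the z loop is one 4-wide vector and the final
 * reduction over the four lanes goes through fx_tmp/fy_tmp/fz_tmp.
 */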
{
    float  fx_tmp[4], fy_tmp[4], fz_tmp[4];

    __m128 fx_SSE, fy_SSE, fz_SSE;
    __m128 tx_SSE, ty_SSE, tz_SSE;
    __m128 dx_SSE, dy_SSE, dz_SSE;
    __m128 gval_SSE;
    __m128 fxy1_SSE, fz1_SSE;

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    tz_SSE = _mm_loadu_ps(thz);
    dz_SSE = _mm_loadu_ps(dthz);

    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_SSE  = _mm_load1_ps(thx+ithx);
        dx_SSE  = _mm_load1_ps(dthx+ithx);

        for (ithy = 0; (ithy < 4); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            gval_SSE = _mm_loadu_ps(grid+index_xy+k0);

            fxy1_SSE = _mm_mul_ps(tz_SSE, gval_SSE);
            fz1_SSE  = _mm_mul_ps(dz_SSE, gval_SSE);

            fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE));
        }
    }

    _mm_storeu_ps(fx_tmp, fx_SSE);
    _mm_storeu_ps(fy_tmp, fy_SSE);
    _mm_storeu_ps(fz_tmp, fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SSE_ORDER4
#endif

#ifdef PME_SPREAD_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
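/* This variant rounds the start of each z row down to a multiple of 4
 * (offset = k0 & 3) so that aligned _mm_load_ps/_mm_store_ps can be
 * used, and zeroes the z weights that fall outside [0, PME_ORDER) with
 * the precomputed masks work->mask_SSE0/1. A sketch of how such masks
 * could be initialized (an assumption; the original sets them up
 * elsewhere): lane i of mask_SSE0[off] selects thz[i-off] when
 * i >= off, lane i of mask_SSE1[off] selects thz[i+4-off] when
 * i+4-off < PME_ORDER:
 *
 *     union { unsigned int u; float f; } bits;
 *     float m0[4], m1[4];
 *     int   off, i;
 *
 *     for (off = 0; off < 4; off++)
 *     {
 *         for (i = 0; i < 4; i++)
 *         {
 *             bits.u = (i >= off)                ? 0xffffffffU : 0;
 *             m0[i]  = bits.f;
 *             bits.u = (i + 4 - off < PME_ORDER) ? 0xffffffffU : 0;
 *             m1[i]  = bits.f;
 *         }
 *         work->mask_SSE0[off] = _mm_loadu_ps(m0);
 *         work->mask_SSE1[off] = _mm_loadu_ps(m1);
 *     }
 */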
{
    int    offset, index;

    __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3, ty_SSE4;
    __m128 tz_SSE0, tz_SSE1, vx_SSE, vx_tz_SSE0, vx_tz_SSE1;
    __m128 sum_SSE00, sum_SSE01, sum_SSE02, sum_SSE03, sum_SSE04;
    __m128 sum_SSE10, sum_SSE11, sum_SSE12, sum_SSE13, sum_SSE14;
    __m128 gri_SSE00, gri_SSE01, gri_SSE02, gri_SSE03, gri_SSE04;
    __m128 gri_SSE10, gri_SSE11, gri_SSE12, gri_SSE13, gri_SSE14;

    offset = k0 & 3; /* restored: shift k0 down to the previous 16-byte boundary */

    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);
#if PME_ORDER == 5
    ty_SSE4 = _mm_load1_ps(&thy[4]);
#endif

    tz_SSE0 = _mm_loadu_ps(thz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]);

    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        valx  = qn*thx[ithx]; /* restored: qn is the particle charge */

        vx_SSE = _mm_load1_ps(&valx);

        vx_tz_SSE0 = _mm_mul_ps(vx_SSE, tz_SSE0);
        vx_tz_SSE1 = _mm_mul_ps(vx_SSE, tz_SSE1);

        gri_SSE00 = _mm_load_ps(grid+index+0*pnz);
        gri_SSE01 = _mm_load_ps(grid+index+1*pnz);
        gri_SSE02 = _mm_load_ps(grid+index+2*pnz);
        gri_SSE03 = _mm_load_ps(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_SSE04 = _mm_load_ps(grid+index+4*pnz);
#endif
        gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4);
        gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4);
        gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4);
        gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4);
#endif

        sum_SSE00 = _mm_add_ps(gri_SSE00, _mm_mul_ps(vx_tz_SSE0, ty_SSE0));
        sum_SSE01 = _mm_add_ps(gri_SSE01, _mm_mul_ps(vx_tz_SSE0, ty_SSE1));
        sum_SSE02 = _mm_add_ps(gri_SSE02, _mm_mul_ps(vx_tz_SSE0, ty_SSE2));
        sum_SSE03 = _mm_add_ps(gri_SSE03, _mm_mul_ps(vx_tz_SSE0, ty_SSE3));
#if PME_ORDER == 5
        sum_SSE04 = _mm_add_ps(gri_SSE04, _mm_mul_ps(vx_tz_SSE0, ty_SSE4));
#endif
        sum_SSE10 = _mm_add_ps(gri_SSE10, _mm_mul_ps(vx_tz_SSE1, ty_SSE0));
        sum_SSE11 = _mm_add_ps(gri_SSE11, _mm_mul_ps(vx_tz_SSE1, ty_SSE1));
        sum_SSE12 = _mm_add_ps(gri_SSE12, _mm_mul_ps(vx_tz_SSE1, ty_SSE2));
        sum_SSE13 = _mm_add_ps(gri_SSE13, _mm_mul_ps(vx_tz_SSE1, ty_SSE3));
#if PME_ORDER == 5
        sum_SSE14 = _mm_add_ps(gri_SSE14, _mm_mul_ps(vx_tz_SSE1, ty_SSE4));
#endif

        _mm_store_ps(grid+index+0*pnz, sum_SSE00);
        _mm_store_ps(grid+index+1*pnz, sum_SSE01);
        _mm_store_ps(grid+index+2*pnz, sum_SSE02);
        _mm_store_ps(grid+index+3*pnz, sum_SSE03);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz, sum_SSE04);
#endif
        _mm_store_ps(grid+index+0*pnz+4, sum_SSE10);
        _mm_store_ps(grid+index+1*pnz+4, sum_SSE11);
        _mm_store_ps(grid+index+2*pnz+4, sum_SSE12);
        _mm_store_ps(grid+index+3*pnz+4, sum_SSE13);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz+4, sum_SSE14);
#endif
    }
}
#undef PME_SPREAD_SSE_ALIGNED
#endif

#ifdef PME_GATHER_F_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
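/* The same offset/mask trick as in the aligned spread above: each z row
 * is read with two aligned loads starting at the previous 16-byte
 * boundary, and the z weights and derivatives outside [0, PME_ORDER)
 * are masked to zero up front, so the extra lanes contribute nothing
 * to the force sums.
 */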
{
    int    offset;
    float  fx_tmp[4], fy_tmp[4], fz_tmp[4];

    __m128 fx_SSE, fy_SSE, fz_SSE;
    __m128 tx_SSE, ty_SSE, tz_SSE0, tz_SSE1;
    __m128 dx_SSE, dy_SSE, dz_SSE0, dz_SSE1;
    __m128 gval_SSE0, gval_SSE1;
    __m128 fxy1_SSE0, fz1_SSE0, fxy1_SSE1, fz1_SSE1, fxy1_SSE, fz1_SSE;

    offset = k0 & 3; /* restored: shift k0 down to the previous 16-byte boundary */

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    tz_SSE0 = _mm_loadu_ps(thz-offset);
    dz_SSE0 = _mm_loadu_ps(dthz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    dz_SSE1 = _mm_loadu_ps(dthz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]);
    dz_SSE0 = _mm_and_ps(dz_SSE0, work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]);
    dz_SSE1 = _mm_and_ps(dz_SSE1, work->mask_SSE1[offset]);

    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_SSE  = _mm_load1_ps(thx+ithx);
        dx_SSE  = _mm_load1_ps(dthx+ithx);

        for (ithy = 0; (ithy < PME_ORDER); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset);
            gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4);

            fxy1_SSE0 = _mm_mul_ps(tz_SSE0, gval_SSE0);
            fz1_SSE0  = _mm_mul_ps(dz_SSE0, gval_SSE0);
            fxy1_SSE1 = _mm_mul_ps(tz_SSE1, gval_SSE1);
            fz1_SSE1  = _mm_mul_ps(dz_SSE1, gval_SSE1);

            fxy1_SSE = _mm_add_ps(fxy1_SSE0, fxy1_SSE1);
            fz1_SSE  = _mm_add_ps(fz1_SSE0, fz1_SSE1);

            fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE));
        }
    }

    /* fx_tmp is not guaranteed to be 16-byte aligned, so use unaligned stores */
    _mm_storeu_ps(fx_tmp, fx_SSE);
    _mm_storeu_ps(fy_tmp, fy_SSE);
    _mm_storeu_ps(fz_tmp, fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SSE_ALIGNED
#endif