2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef _kernelutil_sparc64_hpc_ace_double_h_
36 #define _kernelutil_sparc64_hpc_ace_double_h_
38 /* Fujitsu header borrows the name from SSE2, since some instructions have aliases */
39 #include <emmintrin.h>
/* Builds a 2-bit selector: x goes into bit 1 and y into bit 0.
 * It is used in this file as the third argument of _fjsp_shuffle_v2r8().
 */
41 #define GMX_FJSP_SHUFFLE2(x, y) (((x)<<1) | (y))
/* In-place 2x2 transpose of the register pair (row0, row1):
 *   row0 <- unpacklo(row0, row1)
 *   row1 <- unpackhi(old row0, row1)
 * Both arguments are assigned to, so they must be modifiable lvalues.
 * The expansion is a brace-enclosed block that declares a temporary called
 * __gmx_t1; it is a statement, not an expression.
 */
43 #define GMX_FJSP_TRANSPOSE2_V2R8(row0, row1) { \
44 _fjsp_v2r8 __gmx_t1 = row0; \
45 row0 = _fjsp_unpacklo_v2r8(row0, row1); \
46 row1 = _fjsp_unpackhi_v2r8(__gmx_t1, row1); \
/* Debug helper: writes the two elements of a to stdout.
 *   s  label printed in front of the values
 *   a  register to print
 * The low element is extracted with _fjsp_storel_v2r8 and the high element
 * with _fjsp_storeh_v2r8; the output format is "<s>: <lo> <hi>\n".
 */
51 gmx_fjsp_print_v2r8(const char *s, _fjsp_v2r8 a)
55 _fjsp_storel_v2r8(&lo, a);
56 _fjsp_storeh_v2r8(&hi, a);
57 printf("%s: %g %g\n", s, lo, hi);
/* Returns a register built by _fjsp_set_v2r8(d, d), i.e. with d in both elements. */
62 gmx_fjsp_set1_v2r8(double d)
64 return _fjsp_set_v2r8(d, d);
/* Reads one double from ptr and returns it replicated into both elements
 * (delegates to gmx_fjsp_set1_v2r8).
 */
68 gmx_fjsp_load1_v2r8(const double * gmx_restrict ptr)
70 return gmx_fjsp_set1_v2r8(*ptr);
/* Tests whether a < b holds in at least one element.
 * The compare result is OR-ed with its own high element (moved down via
 * unpackhi), so the low element ends up non-zero if either comparison was
 * true; that low element is then written to conv.d.
 * NOTE(review): presumably conv is a union that lets the stored bit pattern
 * be read back as an integer for the return value - confirm.
 */
75 gmx_fjsp_any_lt_v2r8(_fjsp_v2r8 a, _fjsp_v2r8 b)
84 a = _fjsp_cmplt_v2r8(a, b);
85 a = _fjsp_or_v2r8(a, _fjsp_unpackhi_v2r8(a, a));
86 _fjsp_storel_v2r8(&(conv.d), a);
/* Returns an approximation of 1/sqrt(x) for both elements of x.
 * Starts from the _fjsp_rsqrta_v2r8 estimate and refines it with the update
 *   lu <- (0.5*lu) * nmsub(lu*lu, x, 3)
 * which is the Newton-Raphson step 0.5*lu*(3 - x*lu*lu), assuming
 * _fjsp_nmsub_v2r8(a, b, c) evaluates c - a*b (TODO confirm).
 * The update is applied three times, or twice when
 * GMX_RELAXED_DOUBLE_PRECISION is defined (the last one is in the return).
 */
91 static gmx_inline _fjsp_v2r8
92 gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
94 const _fjsp_v2r8 half = gmx_fjsp_set1_v2r8(0.5);
95 const _fjsp_v2r8 three = gmx_fjsp_set1_v2r8(3.0);
96 _fjsp_v2r8 lu = _fjsp_rsqrta_v2r8(x);
98 lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
99 /* The HPC-ACE instruction set is only available in double precision, while
100 * single precision is typically sufficient for Gromacs. If you define
101 * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
102 * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
103 * double precision (53 bits). This is still clearly higher than single precision (24 bits).
105 #ifndef GMX_RELAXED_DOUBLE_PRECISION
106 lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
108 return _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
/* Returns an approximation of 1/x for both elements of x.
 * Starts from the _fjsp_rcpa_v2r8 estimate and refines it with the update
 *   lu <- lu * nmsub(lu, x, 2)
 * which is the Newton-Raphson step lu*(2 - x*lu), assuming
 * _fjsp_nmsub_v2r8(a, b, c) evaluates c - a*b (TODO confirm).
 * The update is applied three times, or twice when
 * GMX_RELAXED_DOUBLE_PRECISION is defined (the last one is in the return).
 * NOTE(review): lu is declared as __m128d here while the rest of the file
 * uses _fjsp_v2r8 - presumably the two are aliases in the Fujitsu header.
 */
113 static gmx_inline _fjsp_v2r8
114 gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
116 const _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
117 __m128d lu = _fjsp_rcpa_v2r8(x);
119 /* Perform three N-R steps for double precision */
120 lu = _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
121 /* The HPC-ACE instruction set is only available in double precision, while
122 * single precision is typically sufficient for Gromacs. If you define
123 * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
124 * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
125 * double precision (53 bits). This is still clearly higher than single precision (24 bits).
127 #ifndef GMX_RELAXED_DOUBLE_PRECISION
128 lu = _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
130 return _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
/* Returns the squared length dx*dx + dy*dy + dz*dz for each element.
 * Evaluated as madd(dx, dx, madd(dy, dy, dz*dz)), assuming
 * _fjsp_madd_v2r8(a, b, c) evaluates a*b + c (TODO confirm).
 */
134 static gmx_inline _fjsp_v2r8
135 gmx_fjsp_calc_rsq_v2r8(_fjsp_v2r8 dx, _fjsp_v2r8 dy, _fjsp_v2r8 dz)
137 return _fjsp_madd_v2r8(dx, dx, _fjsp_madd_v2r8(dy, dy, _fjsp_mul_v2r8(dz, dz)));
140 /* Element-wise sum of four v2r8 registers, evaluated pairwise as (t0 + t1) + (t2 + t3) */
141 #define gmx_fjsp_sum4_v2r8(t0, t1, t2, t3) _fjsp_add_v2r8(_fjsp_add_v2r8(t0, t1), _fjsp_add_v2r8(t2, t3))
/* Loads one double from ptrA and one from ptrB (each into the low element of
 * a zeroed register) and combines the two low elements with unpacklo:
 * the ptrA value is the first operand, the ptrB value the second.
 */
148 gmx_fjsp_load_2real_swizzle_v2r8(const double * gmx_restrict ptrA,
149 const double * gmx_restrict ptrB)
151 return _fjsp_unpacklo_v2r8(_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA), _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB));
/* Loads one double from ptrA with _fjsp_loadl_v2r8 on top of an all-zero
 * register; presumably the other element of the result stays zero.
 */
155 gmx_fjsp_load_1real_v2r8(const double * gmx_restrict ptrA)
157 return _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
/* Scatters the two elements of xmm1: the low element is stored to *ptrA and
 * the high element (moved to the low position with unpackhi) to *ptrB.
 */
162 gmx_fjsp_store_2real_swizzle_v2r8(double * gmx_restrict ptrA,
163 double * gmx_restrict ptrB,
168 t2 = _fjsp_unpackhi_v2r8(xmm1, xmm1);
169 _fjsp_storel_v2r8(ptrA, xmm1);
170 _fjsp_storel_v2r8(ptrB, t2);
/* Stores the low element of xmm1 to *ptrA. */
174 gmx_fjsp_store_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
176 _fjsp_storel_v2r8(ptrA, xmm1);
180 /* Similar to store, but increments value in memory */
/* The low element of xmm1 is added to *ptrA and the high element (moved down
 * with unpackhi) is added to *ptrB; both results are written back.
 */
182 gmx_fjsp_increment_2real_swizzle_v2r8(double * gmx_restrict ptrA,
183 double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
187 t1 = _fjsp_unpackhi_v2r8(xmm1, xmm1);
188 xmm1 = _fjsp_add_v2r8(xmm1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA));
189 t1 = _fjsp_add_v2r8(t1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB));
190 _fjsp_storel_v2r8(ptrA, xmm1);
191 _fjsp_storel_v2r8(ptrB, t1);
/* Read-modify-write: adds the low element of xmm1 to the double at *ptrA. */
195 gmx_fjsp_increment_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
199 tmp = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
200 tmp = _fjsp_add_v2r8(tmp, xmm1);
201 _fjsp_storel_v2r8(ptrA, tmp);
/* Loads one pair of doubles from p1 and one from p2 and de-interleaves them:
 *   *c6  <- unpacklo(pair1, pair2)  (first value of each pair)
 *   *c12 <- unpackhi(pair1, pair2)  (second value of each pair)
 * Full-register loads are used, hence the alignment remark below.
 * NOTE(review): t3 is declared but not used in the statements shown.
 */
206 static gmx_inline void
207 gmx_fjsp_load_2pair_swizzle_v2r8(const double * gmx_restrict p1,
208 const double * gmx_restrict p2,
209 _fjsp_v2r8 * gmx_restrict c6,
210 _fjsp_v2r8 * gmx_restrict c12)
212 _fjsp_v2r8 t1, t2, t3;
214 /* The c6/c12 array should be aligned */
215 t1 = _fjsp_load_v2r8(p1);
216 t2 = _fjsp_load_v2r8(p2);
217 *c6 = _fjsp_unpacklo_v2r8(t1, t2);
218 *c12 = _fjsp_unpackhi_v2r8(t1, t2);
/* Single-pair variant: loads p1[0] into *c6 and p1[1] into *c12, each with
 * _fjsp_loadl_v2r8 on top of an all-zero register.
 */
221 static gmx_inline void
222 gmx_fjsp_load_1pair_swizzle_v2r8(const double * gmx_restrict p1,
223 _fjsp_v2r8 * gmx_restrict c6,
224 _fjsp_v2r8 * gmx_restrict c12)
226 *c6 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
227 *c12 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
/* Reads three doubles from xyz and three from xyz_shift, adds them
 * component-wise and writes each sum to *x1, *y1, *z1 through a shuffle of the
 * sum register with itself (selectors (0,0) and (1,1)), which presumably
 * replicates the chosen element into both positions - hence "broadcast".
 * The first two components are read with a full-register load, the third
 * with _fjsp_loadl_v2r8.
 */
231 static gmx_inline void
232 gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
233 const double * gmx_restrict xyz,
234 _fjsp_v2r8 * gmx_restrict x1,
235 _fjsp_v2r8 * gmx_restrict y1,
236 _fjsp_v2r8 * gmx_restrict z1)
238 _fjsp_v2r8 mem_xy, mem_z, mem_sxy, mem_sz;
240 mem_xy = _fjsp_load_v2r8(xyz);
241 mem_z = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+2);
242 mem_sxy = _fjsp_load_v2r8(xyz_shift);
243 mem_sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
245 mem_xy = _fjsp_add_v2r8(mem_xy, mem_sxy);
246 mem_z = _fjsp_add_v2r8(mem_z, mem_sz);
248 *x1 = _fjsp_shuffle_v2r8(mem_xy, mem_xy, GMX_FJSP_SHUFFLE2(0, 0));
249 *y1 = _fjsp_shuffle_v2r8(mem_xy, mem_xy, GMX_FJSP_SHUFFLE2(1, 1));
250 *z1 = _fjsp_shuffle_v2r8(mem_z, mem_z, GMX_FJSP_SHUFFLE2(0, 0));
/* Three-vector variant: reads xyz[0..8] (three consecutive xyz triplets) and
 * the three-component shift at xyz_shift, adds the shift to every triplet and
 * writes the nine sums to *x1..*z3, each through a self-shuffle with selector
 * (0,0) or (1,1).
 * Because the data is read two doubles at a time, the register pairs are
 * (x1,y1) (z1,x2) (y2,z2) (x3,y3) (z3); sxy, szx and syz are the shift
 * rearranged to line up with those pairs.
 * NOTE(review): the contents of szx/syz depend on the operand and selector
 * convention of _fjsp_shuffle_v2r8 - confirm against the intrinsic reference.
 */
254 static gmx_inline void
255 gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
256 const double * gmx_restrict xyz,
257 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
258 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
259 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
261 _fjsp_v2r8 t1, t2, t3, t4, t5, sxy, sz, szx, syz;
263 t1 = _fjsp_load_v2r8(xyz);
264 t2 = _fjsp_load_v2r8(xyz+2);
265 t3 = _fjsp_load_v2r8(xyz+4);
266 t4 = _fjsp_load_v2r8(xyz+6);
267 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+8);
269 sxy = _fjsp_load_v2r8(xyz_shift);
270 sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
271 szx = _fjsp_shuffle_v2r8(sz, sxy, GMX_FJSP_SHUFFLE2(0, 0));
272 syz = _fjsp_shuffle_v2r8(sxy, sz, GMX_FJSP_SHUFFLE2(0, 1));
274 t1 = _fjsp_add_v2r8(t1, sxy);
275 t2 = _fjsp_add_v2r8(t2, szx);
276 t3 = _fjsp_add_v2r8(t3, syz);
277 t4 = _fjsp_add_v2r8(t4, sxy);
278 t5 = _fjsp_add_v2r8(t5, sz);
280 *x1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(0, 0));
281 *y1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(1, 1));
282 *z1 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(0, 0));
283 *x2 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(1, 1));
284 *y2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(0, 0));
285 *z2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(1, 1));
286 *x3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(0, 0));
287 *y3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(1, 1));
288 *z3 = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(0, 0));
/* Four-vector variant: reads xyz[0..11] (four consecutive xyz triplets) with
 * six full-register loads, adds the three-component shift from xyz_shift to
 * every triplet and writes the twelve sums to *x1..*z4, each through a
 * self-shuffle with selector (0,0) or (1,1).
 * The register pairs are (x1,y1) (z1,x2) (y2,z2) (x3,y3) (z3,x4) (y4,z4), so
 * the shift pattern sxy, szx, syz is applied twice in sequence.
 * NOTE(review): the contents of szx/syz depend on the operand and selector
 * convention of _fjsp_shuffle_v2r8 - confirm against the intrinsic reference.
 */
292 static gmx_inline void
293 gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
294 const double * gmx_restrict xyz,
295 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
296 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
297 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
298 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
300 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;
302 t1 = _fjsp_load_v2r8(xyz);
303 t2 = _fjsp_load_v2r8(xyz+2);
304 t3 = _fjsp_load_v2r8(xyz+4);
305 t4 = _fjsp_load_v2r8(xyz+6);
306 t5 = _fjsp_load_v2r8(xyz+8);
307 t6 = _fjsp_load_v2r8(xyz+10);
309 sxy = _fjsp_load_v2r8(xyz_shift);
310 sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
311 szx = _fjsp_shuffle_v2r8(sz, sxy, GMX_FJSP_SHUFFLE2(0, 0));
312 syz = _fjsp_shuffle_v2r8(sxy, sz, GMX_FJSP_SHUFFLE2(0, 1));
314 t1 = _fjsp_add_v2r8(t1, sxy);
315 t2 = _fjsp_add_v2r8(t2, szx);
316 t3 = _fjsp_add_v2r8(t3, syz);
317 t4 = _fjsp_add_v2r8(t4, sxy);
318 t5 = _fjsp_add_v2r8(t5, szx);
319 t6 = _fjsp_add_v2r8(t6, syz);
321 *x1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(0, 0));
322 *y1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(1, 1));
323 *z1 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(0, 0));
324 *x2 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(1, 1));
325 *y2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(0, 0));
326 *z2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(1, 1));
327 *x3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(0, 0));
328 *y3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(1, 1));
329 *z3 = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(0, 0));
330 *x4 = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(1, 1));
331 *y4 = _fjsp_shuffle_v2r8(t6, t6, GMX_FJSP_SHUFFLE2(0, 0));
332 *z4 = _fjsp_shuffle_v2r8(t6, t6, GMX_FJSP_SHUFFLE2(1, 1));
/* Loads p1[0], p1[1], p1[2] into *x, *y, *z respectively, each with
 * _fjsp_loadl_v2r8 on top of an all-zero register.
 */
337 static gmx_inline void
338 gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
339 _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
341 *x = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
342 *y = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
343 *z = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
/* Loads the nine doubles p1[0..8] into *x1, *y1, *z1, *x2, ..., *z3 in that
 * order, each with _fjsp_loadl_v2r8 on top of an all-zero register.
 */
346 static gmx_inline void
347 gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
348 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
349 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
350 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
352 *x1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
353 *y1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
354 *z1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
355 *x2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+3);
356 *y2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+4);
357 *z2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+5);
358 *x3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+6);
359 *y3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+7);
360 *z3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+8);
/* Loads the twelve doubles p1[0..11] into *x1, *y1, *z1, *x2, ..., *z4 in
 * that order, each with _fjsp_loadl_v2r8 on top of an all-zero register.
 */
363 static gmx_inline void
364 gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
365 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
366 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
367 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
368 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
370 *x1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
371 *y1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
372 *z1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
373 *x2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+3);
374 *y2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+4);
375 *z2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+5);
376 *x3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+6);
377 *y3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+7);
378 *z3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+8);
379 *x4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+9);
380 *y4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+10);
381 *z4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+11);
/* Gathers one xyz triplet from each of ptrA and ptrB.
 * The first two components of each are read with a full-register load and
 * transposed, so afterwards t1 holds element 0 of both pointers and t2 holds
 * element 1 of both (ptrA in the low position). The third components are read
 * separately and combined with unpacklo into *z1.
 * NOTE(review): t1 and t2 are presumably what is returned through x1 and y1.
 */
385 static gmx_inline void
386 gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA,
387 const double * gmx_restrict ptrB,
388 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
390 _fjsp_v2r8 t1, t2, t3, t4;
391 t1 = _fjsp_load_v2r8(ptrA);
392 t2 = _fjsp_load_v2r8(ptrB);
393 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
394 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
395 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
398 *z1 = _fjsp_unpacklo_v2r8(t3, t4);
/* Gathers three xyz triplets (nine doubles) from each of ptrA and ptrB.
 * Elements 0..7 of each pointer are read as four register pairs and each
 * (ptrA, ptrB) pair is transposed, so every odd-numbered temporary then holds
 * an even-indexed element of both pointers and every even-numbered temporary
 * the following odd-indexed element. Element 8 is read separately and
 * combined with unpacklo into *z3.
 * NOTE(review): t1..t8 are presumably what is returned through x1..y3 in
 * memory order.
 */
401 static gmx_inline void
402 gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
403 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
404 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
405 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
407 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
408 t1 = _fjsp_load_v2r8(ptrA);
409 t2 = _fjsp_load_v2r8(ptrB);
410 t3 = _fjsp_load_v2r8(ptrA+2);
411 t4 = _fjsp_load_v2r8(ptrB+2);
412 t5 = _fjsp_load_v2r8(ptrA+4);
413 t6 = _fjsp_load_v2r8(ptrB+4);
414 t7 = _fjsp_load_v2r8(ptrA+6);
415 t8 = _fjsp_load_v2r8(ptrB+6);
416 t9 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
417 t10 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+8);
418 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
419 GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
420 GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
421 GMX_FJSP_TRANSPOSE2_V2R8(t7, t8);
430 *z3 = _fjsp_unpacklo_v2r8(t9, t10);
/* Gathers four xyz triplets (twelve doubles) from each of ptrA and ptrB.
 * The work is done in two passes that reuse t1..t6: the first pass reads and
 * transposes elements 0..5 of both pointers, the second pass elements 6..11.
 * After each transpose an odd-numbered temporary holds an even-indexed element
 * of both pointers and the following even-numbered temporary the next
 * odd-indexed element.
 * NOTE(review): the temporaries are presumably copied to x1..z2 after the
 * first pass and to x3..z4 after the second.
 */
434 static gmx_inline void
435 gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
436 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
437 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
438 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
439 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
441 _fjsp_v2r8 t1, t2, t3, t4, t5, t6;
442 t1 = _fjsp_load_v2r8(ptrA);
443 t2 = _fjsp_load_v2r8(ptrB);
444 t3 = _fjsp_load_v2r8(ptrA+2);
445 t4 = _fjsp_load_v2r8(ptrB+2);
446 t5 = _fjsp_load_v2r8(ptrA+4);
447 t6 = _fjsp_load_v2r8(ptrB+4);
448 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
449 GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
450 GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
457 t1 = _fjsp_load_v2r8(ptrA+6);
458 t2 = _fjsp_load_v2r8(ptrB+6);
459 t3 = _fjsp_load_v2r8(ptrA+8);
460 t4 = _fjsp_load_v2r8(ptrB+8);
461 t5 = _fjsp_load_v2r8(ptrA+10);
462 t6 = _fjsp_load_v2r8(ptrB+10);
463 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
464 GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
465 GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
/* Read-modify-write on ptrA[0..2]: subtracts the low elements of x1, y1, z1
 * from ptrA[0], ptrA[1], ptrA[2] respectively. Only the low element of each
 * difference is stored.
 */
476 gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
477 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
479 _fjsp_v2r8 t1, t2, t3;
481 t1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
482 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+1);
483 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
485 t1 = _fjsp_sub_v2r8(t1, x1);
486 t2 = _fjsp_sub_v2r8(t2, y1);
487 t3 = _fjsp_sub_v2r8(t3, z1);
488 _fjsp_storel_v2r8(ptrA, t1);
489 _fjsp_storel_v2r8(ptrA+1, t2);
490 _fjsp_storel_v2r8(ptrA+2, t3);
/* Fused variant of the 1rvec/1ptr decrement: updates ptrA[0..2] with
 * nmsub(fscal, d, old) for d = dx1, dy1, dz1, i.e. old - fscal*d assuming
 * _fjsp_nmsub_v2r8(a, b, c) evaluates c - a*b (TODO confirm).
 * Only the low element of each result is stored.
 */
494 gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 fscal,
495 _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
497 _fjsp_v2r8 t1, t2, t3;
499 t1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
500 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+1);
501 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
503 t1 = _fjsp_nmsub_v2r8(fscal, dx1, t1);
504 t2 = _fjsp_nmsub_v2r8(fscal, dy1, t2);
505 t3 = _fjsp_nmsub_v2r8(fscal, dz1, t3);
506 _fjsp_storel_v2r8(ptrA, t1);
507 _fjsp_storel_v2r8(ptrA+1, t2);
508 _fjsp_storel_v2r8(ptrA+2, t3);
/* Read-modify-write on ptrA[0..8]: subtracts the low elements of x1..z3 from
 * the nine doubles at ptrA, in the order x1 y1 z1 x2 y2 z2 x3 y3 z3.
 * The low elements of neighbouring inputs are first packed into pairs with
 * unpacklo so that they line up with the two-double loads from memory; the
 * differences are then written back one double at a time with storel/storeh.
 */
513 gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
514 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
515 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
516 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
518 _fjsp_v2r8 t1, t2, t3, t4, t5;
520 t1 = _fjsp_load_v2r8(ptrA);
521 t2 = _fjsp_load_v2r8(ptrA+2);
522 t3 = _fjsp_load_v2r8(ptrA+4);
523 t4 = _fjsp_load_v2r8(ptrA+6);
524 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
526 x1 = _fjsp_unpacklo_v2r8(x1, y1);
527 z1 = _fjsp_unpacklo_v2r8(z1, x2);
528 y2 = _fjsp_unpacklo_v2r8(y2, z2);
529 x3 = _fjsp_unpacklo_v2r8(x3, y3);
530 /* nothing to be done for z3 */
532 t1 = _fjsp_sub_v2r8(t1, x1);
533 t2 = _fjsp_sub_v2r8(t2, z1);
534 t3 = _fjsp_sub_v2r8(t3, y2);
535 t4 = _fjsp_sub_v2r8(t4, x3);
536 t5 = _fjsp_sub_v2r8(t5, z3);
537 _fjsp_storel_v2r8(ptrA, t1);
538 _fjsp_storeh_v2r8(ptrA+1, t1);
539 _fjsp_storel_v2r8(ptrA+2, t2);
540 _fjsp_storeh_v2r8(ptrA+3, t2);
541 _fjsp_storel_v2r8(ptrA+4, t3);
542 _fjsp_storeh_v2r8(ptrA+5, t3);
543 _fjsp_storel_v2r8(ptrA+6, t4);
544 _fjsp_storeh_v2r8(ptrA+7, t4);
545 _fjsp_storel_v2r8(ptrA+8, t5);
/* Read-modify-write on ptrA[0..11]: subtracts the low elements of x1..z4 from
 * the twelve doubles at ptrA, in the order x1 y1 z1 x2 ... z4.
 * The low elements of neighbouring inputs are packed into pairs with unpacklo
 * to match the two-double loads; results are stored one double at a time.
 * NOTE(review): every difference is written out twice in the source (once
 * for the storel and once for the storeh); the two expressions are identical.
 */
550 gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
551 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
552 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
553 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
554 _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
556 _fjsp_v2r8 t1, t2, t3, t4, t5, t6;
558 t1 = _fjsp_load_v2r8(ptrA);
559 t2 = _fjsp_load_v2r8(ptrA+2);
560 t3 = _fjsp_load_v2r8(ptrA+4);
561 t4 = _fjsp_load_v2r8(ptrA+6);
562 t5 = _fjsp_load_v2r8(ptrA+8);
563 t6 = _fjsp_load_v2r8(ptrA+10);
565 x1 = _fjsp_unpacklo_v2r8(x1, y1);
566 z1 = _fjsp_unpacklo_v2r8(z1, x2);
567 y2 = _fjsp_unpacklo_v2r8(y2, z2);
568 x3 = _fjsp_unpacklo_v2r8(x3, y3);
569 z3 = _fjsp_unpacklo_v2r8(z3, x4);
570 y4 = _fjsp_unpacklo_v2r8(y4, z4);
572 _fjsp_storel_v2r8(ptrA, _fjsp_sub_v2r8( t1, x1 ));
573 _fjsp_storeh_v2r8(ptrA+1, _fjsp_sub_v2r8( t1, x1 ));
574 _fjsp_storel_v2r8(ptrA+2, _fjsp_sub_v2r8( t2, z1 ));
575 _fjsp_storeh_v2r8(ptrA+3, _fjsp_sub_v2r8( t2, z1 ));
576 _fjsp_storel_v2r8(ptrA+4, _fjsp_sub_v2r8( t3, y2 ));
577 _fjsp_storeh_v2r8(ptrA+5, _fjsp_sub_v2r8( t3, y2 ));
578 _fjsp_storel_v2r8(ptrA+6, _fjsp_sub_v2r8( t4, x3 ));
579 _fjsp_storeh_v2r8(ptrA+7, _fjsp_sub_v2r8( t4, x3 ));
580 _fjsp_storel_v2r8(ptrA+8, _fjsp_sub_v2r8( t5, z3 ));
581 _fjsp_storeh_v2r8(ptrA+9, _fjsp_sub_v2r8( t5, z3 ));
582 _fjsp_storel_v2r8(ptrA+10, _fjsp_sub_v2r8( t6, y4 ));
583 _fjsp_storeh_v2r8(ptrA+11, _fjsp_sub_v2r8( t6, y4 ));
/* Two-pointer decrement of one xyz triplet.
 * The low elements of x1, y1, z1 are subtracted from ptrA[0..2] and the high
 * elements from ptrB[0..2]: unpacklo/unpackhi of (x1, y1) build the xy pair
 * for ptrA and ptrB respectively, and unpackhi(z1, z1) moves the high z
 * element into the low position for ptrB.
 */
587 gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
588 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
590 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7;
592 t1 = _fjsp_load_v2r8(ptrA);
593 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
594 t3 = _fjsp_load_v2r8(ptrB);
595 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
597 t5 = _fjsp_unpacklo_v2r8(x1, y1);
598 t6 = _fjsp_unpackhi_v2r8(x1, y1);
599 t7 = _fjsp_unpackhi_v2r8(z1, z1);
601 t1 = _fjsp_sub_v2r8(t1, t5);
602 t2 = _fjsp_sub_v2r8(t2, z1);
604 t3 = _fjsp_sub_v2r8(t3, t6);
605 t4 = _fjsp_sub_v2r8(t4, t7);
607 _fjsp_storel_v2r8(ptrA, t1);
608 _fjsp_storeh_v2r8(ptrA+1, t1);
609 _fjsp_storel_v2r8(ptrA+2, t2);
610 _fjsp_storel_v2r8(ptrB, t3);
611 _fjsp_storeh_v2r8(ptrB+1, t3);
612 _fjsp_storel_v2r8(ptrB+2, t4);
/* Fused two-pointer decrement of one xyz triplet.
 * fscalA / fscalB replicate the low / high element of fscal. ptrA[0..2] is
 * updated with the low elements of dx1, dy1, dz1 scaled by fscalA, and
 * ptrB[0..2] with the high elements scaled by fscalB, each through
 * nmsub(scale, d, old), i.e. old - scale*d assuming
 * _fjsp_nmsub_v2r8(a, b, c) evaluates c - a*b (TODO confirm).
 */
617 gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
618 _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
620 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, fscalA, fscalB;
622 t1 = _fjsp_load_v2r8(ptrA);
623 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
624 t3 = _fjsp_load_v2r8(ptrB);
625 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
626 fscalA = _fjsp_unpacklo_v2r8(fscal, fscal);
627 fscalB = _fjsp_unpackhi_v2r8(fscal, fscal);
629 t5 = _fjsp_unpacklo_v2r8(dx1, dy1);
630 t6 = _fjsp_unpackhi_v2r8(dx1, dy1);
631 t7 = _fjsp_unpackhi_v2r8(dz1, dz1);
633 t1 = _fjsp_nmsub_v2r8(fscalA, t5, t1);
634 t2 = _fjsp_nmsub_v2r8(fscalA, dz1, t2);
636 t3 = _fjsp_nmsub_v2r8(fscalB, t6, t3);
637 t4 = _fjsp_nmsub_v2r8(fscalB, t7, t4);
639 _fjsp_storel_v2r8(ptrA, t1);
640 _fjsp_storeh_v2r8(ptrA+1, t1);
641 _fjsp_storel_v2r8(ptrA+2, t2);
642 _fjsp_storel_v2r8(ptrB, t3);
643 _fjsp_storeh_v2r8(ptrB+1, t3);
644 _fjsp_storel_v2r8(ptrB+2, t4);
/* Two-pointer decrement of three xyz triplets (nine doubles per pointer).
 * The low elements of x1..z3 are subtracted from ptrA[0..8] and the high
 * elements from ptrB[0..8], in the order x1 y1 z1 x2 y2 z2 x3 y3 z3.
 * tA, tC, tE, tG (unpacklo) are the packed pairs for ptrA; tB, tD, tF, tH
 * (unpackhi) are the packed pairs for ptrB; tI moves the high element of z3
 * into the low position for the last ptrB value.
 * Results are written back one double at a time with storel/storeh.
 */
649 gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
650 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
651 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
652 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
654 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
655 _fjsp_v2r8 tA, tB, tC, tD, tE, tF, tG, tH, tI;
657 t1 = _fjsp_load_v2r8(ptrA);
658 t2 = _fjsp_load_v2r8(ptrA+2);
659 t3 = _fjsp_load_v2r8(ptrA+4);
660 t4 = _fjsp_load_v2r8(ptrA+6);
661 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
662 t6 = _fjsp_load_v2r8(ptrB);
663 t7 = _fjsp_load_v2r8(ptrB+2);
664 t8 = _fjsp_load_v2r8(ptrB+4);
665 t9 = _fjsp_load_v2r8(ptrB+6);
666 t10 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+8);
668 tA = _fjsp_unpacklo_v2r8(x1, y1);
669 tB = _fjsp_unpackhi_v2r8(x1, y1);
670 tC = _fjsp_unpacklo_v2r8(z1, x2);
671 tD = _fjsp_unpackhi_v2r8(z1, x2);
672 tE = _fjsp_unpacklo_v2r8(y2, z2);
673 tF = _fjsp_unpackhi_v2r8(y2, z2);
674 tG = _fjsp_unpacklo_v2r8(x3, y3);
675 tH = _fjsp_unpackhi_v2r8(x3, y3);
676 tI = _fjsp_unpackhi_v2r8(z3, z3);
678 t1 = _fjsp_sub_v2r8(t1, tA);
679 t2 = _fjsp_sub_v2r8(t2, tC);
680 t3 = _fjsp_sub_v2r8(t3, tE);
681 t4 = _fjsp_sub_v2r8(t4, tG);
682 t5 = _fjsp_sub_v2r8(t5, z3);
684 t6 = _fjsp_sub_v2r8(t6, tB);
685 t7 = _fjsp_sub_v2r8(t7, tD);
686 t8 = _fjsp_sub_v2r8(t8, tF);
687 t9 = _fjsp_sub_v2r8(t9, tH);
688 t10 = _fjsp_sub_v2r8(t10, tI);
690 _fjsp_storel_v2r8(ptrA, t1);
691 _fjsp_storeh_v2r8(ptrA+1, t1);
692 _fjsp_storel_v2r8(ptrA+2, t2);
693 _fjsp_storeh_v2r8(ptrA+3, t2);
694 _fjsp_storel_v2r8(ptrA+4, t3);
695 _fjsp_storeh_v2r8(ptrA+5, t3);
696 _fjsp_storel_v2r8(ptrA+6, t4);
697 _fjsp_storeh_v2r8(ptrA+7, t4);
698 _fjsp_storel_v2r8(ptrA+8, t5);
699 _fjsp_storel_v2r8(ptrB, t6);
700 _fjsp_storeh_v2r8(ptrB+1, t6);
701 _fjsp_storel_v2r8(ptrB+2, t7);
702 _fjsp_storeh_v2r8(ptrB+3, t7);
703 _fjsp_storel_v2r8(ptrB+4, t8);
704 _fjsp_storeh_v2r8(ptrB+5, t8);
705 _fjsp_storel_v2r8(ptrB+6, t9);
706 _fjsp_storeh_v2r8(ptrB+7, t9);
707 _fjsp_storel_v2r8(ptrB+8, t10);
/* Two-pointer decrement of four xyz triplets (twelve doubles per pointer).
 * The low elements of x1..z4 are subtracted from ptrA[0..11] and the high
 * elements from ptrB[0..11], in the order x1 y1 z1 x2 ... z4.
 * tA, tC, tE, tG, tI, tK (unpacklo) are the packed pairs for ptrA;
 * tB, tD, tF, tH, tJ, tL (unpackhi) are the packed pairs for ptrB.
 * Results are written back one double at a time with storel/storeh.
 */
712 gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
713 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
714 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
715 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
716 _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
718 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
719 _fjsp_v2r8 tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;
721 t1 = _fjsp_load_v2r8(ptrA);
722 t2 = _fjsp_load_v2r8(ptrA+2);
723 t3 = _fjsp_load_v2r8(ptrA+4);
724 t4 = _fjsp_load_v2r8(ptrA+6);
725 t5 = _fjsp_load_v2r8(ptrA+8);
726 t6 = _fjsp_load_v2r8(ptrA+10);
727 t7 = _fjsp_load_v2r8(ptrB);
728 t8 = _fjsp_load_v2r8(ptrB+2);
729 t9 = _fjsp_load_v2r8(ptrB+4);
730 t10 = _fjsp_load_v2r8(ptrB+6);
731 t11 = _fjsp_load_v2r8(ptrB+8);
732 t12 = _fjsp_load_v2r8(ptrB+10);
734 tA = _fjsp_unpacklo_v2r8(x1, y1);
735 tB = _fjsp_unpackhi_v2r8(x1, y1);
736 tC = _fjsp_unpacklo_v2r8(z1, x2);
737 tD = _fjsp_unpackhi_v2r8(z1, x2);
738 tE = _fjsp_unpacklo_v2r8(y2, z2);
739 tF = _fjsp_unpackhi_v2r8(y2, z2);
740 tG = _fjsp_unpacklo_v2r8(x3, y3);
741 tH = _fjsp_unpackhi_v2r8(x3, y3);
742 tI = _fjsp_unpacklo_v2r8(z3, x4);
743 tJ = _fjsp_unpackhi_v2r8(z3, x4);
744 tK = _fjsp_unpacklo_v2r8(y4, z4);
745 tL = _fjsp_unpackhi_v2r8(y4, z4);
747 t1 = _fjsp_sub_v2r8(t1, tA);
748 t2 = _fjsp_sub_v2r8(t2, tC);
749 t3 = _fjsp_sub_v2r8(t3, tE);
750 t4 = _fjsp_sub_v2r8(t4, tG);
751 t5 = _fjsp_sub_v2r8(t5, tI);
752 t6 = _fjsp_sub_v2r8(t6, tK);
754 t7 = _fjsp_sub_v2r8(t7, tB);
755 t8 = _fjsp_sub_v2r8(t8, tD);
756 t9 = _fjsp_sub_v2r8(t9, tF);
757 t10 = _fjsp_sub_v2r8(t10, tH);
758 t11 = _fjsp_sub_v2r8(t11, tJ);
759 t12 = _fjsp_sub_v2r8(t12, tL);
761 _fjsp_storel_v2r8(ptrA, t1);
762 _fjsp_storeh_v2r8(ptrA+1, t1);
763 _fjsp_storel_v2r8(ptrA+2, t2);
764 _fjsp_storeh_v2r8(ptrA+3, t2);
765 _fjsp_storel_v2r8(ptrA+4, t3);
766 _fjsp_storeh_v2r8(ptrA+5, t3);
767 _fjsp_storel_v2r8(ptrA+6, t4);
768 _fjsp_storeh_v2r8(ptrA+7, t4);
769 _fjsp_storel_v2r8(ptrA+8, t5);
770 _fjsp_storeh_v2r8(ptrA+9, t5);
771 _fjsp_storel_v2r8(ptrA+10, t6);
772 _fjsp_storeh_v2r8(ptrA+11, t6);
773 _fjsp_storel_v2r8(ptrB, t7);
774 _fjsp_storeh_v2r8(ptrB+1, t7);
775 _fjsp_storel_v2r8(ptrB+2, t8);
776 _fjsp_storeh_v2r8(ptrB+3, t8);
777 _fjsp_storel_v2r8(ptrB+4, t9);
778 _fjsp_storeh_v2r8(ptrB+5, t9);
779 _fjsp_storel_v2r8(ptrB+6, t10);
780 _fjsp_storeh_v2r8(ptrB+7, t10);
781 _fjsp_storel_v2r8(ptrB+8, t11);
782 _fjsp_storeh_v2r8(ptrB+9, t11);
783 _fjsp_storel_v2r8(ptrB+10, t12);
784 _fjsp_storeh_v2r8(ptrB+11, t12);
/* Reduces the force accumulators of one atom across the two elements and adds
 * the result to memory.
 * fix1/fiy1 are transposed and added, leaving the x sum and the y sum in one
 * register; fiz1 is added to its own high element to form the z sum.
 * The same three sums are added to fptr[0..2] and to fshiftptr[0..2].
 * NOTE(review): the unpackhi below reads t1, which is expected to hold the
 * value fix1 had before the unpacklo overwrote it - confirm it is assigned
 * before this point. t2 and t3 are not used in the statements shown.
 */
789 static gmx_inline void
790 gmx_fjsp_update_iforce_1atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
791 double * gmx_restrict fptr,
792 double * gmx_restrict fshiftptr)
794 __m128d t1, t2, t3, t4;
798 fix1 = _fjsp_unpacklo_v2r8(fix1, fiy1); /* y0 x0 */
799 fiy1 = _fjsp_unpackhi_v2r8(t1, fiy1); /* y1 x1 */
801 fix1 = _fjsp_add_v2r8(fix1, fiy1);
802 fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1, fiz1 ));
804 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
805 _fjsp_storel_v2r8( fptr, t4 );
806 _fjsp_storeh_v2r8( fptr+1, t4 );
807 _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fptr+2), fiz1 ));
809 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
810 _fjsp_storel_v2r8( fshiftptr, t4 );
811 _fjsp_storeh_v2r8( fshiftptr+1, t4 );
812 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
/* Reduces the force accumulators of three atoms across the two elements and
 * adds the results to memory.
 * Neighbouring accumulators are transposed pairwise and added, which leaves
 * the per-component sums packed as (x1,y1) (z1,x2) (y2,z2) (x3,y3) plus z3
 * on its own; these nine sums are added to fptr[0..8].
 * The three atoms are then summed per component (x and y together in fix1,
 * z in fiz1) and that total is added to fshiftptr[0..2].
 * NOTE(review): the unpackhi for fiy3 reads t1, which is expected to hold the
 * value fix3 had before the preceding unpacklo - confirm it is assigned
 * before this point.
 * NOTE(review): which elements t1/t2 pick out of fiz1/fiy2 depends on the
 * operand and selector convention of _fjsp_shuffle_v2r8 - confirm.
 */
815 static gmx_inline void
816 gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
817 _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
818 _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
819 double * gmx_restrict fptr,
820 double * gmx_restrict fshiftptr)
822 __m128d t1, t2, t3, t4, t5, t6;
825 GMX_FJSP_TRANSPOSE2_V2R8(fix1, fiy1);
826 GMX_FJSP_TRANSPOSE2_V2R8(fiz1, fix2);
827 GMX_FJSP_TRANSPOSE2_V2R8(fiy2, fiz2);
829 fix3 = _fjsp_unpacklo_v2r8(fix3, fiy3); /* y0 x0 */
830 fiy3 = _fjsp_unpackhi_v2r8(t1, fiy3); /* y1 x1 */
832 fix1 = _fjsp_add_v2r8(fix1, fiy1);
833 fiz1 = _fjsp_add_v2r8(fiz1, fix2);
834 fiy2 = _fjsp_add_v2r8(fiy2, fiz2);
836 fix3 = _fjsp_add_v2r8(fix3, fiy3);
837 fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3, fiz3));
839 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
840 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
841 t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
842 t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
844 _fjsp_storel_v2r8( fptr, t3 );
845 _fjsp_storeh_v2r8( fptr+1, t3 );
846 _fjsp_storel_v2r8( fptr+2, t4 );
847 _fjsp_storeh_v2r8( fptr+3, t4 );
848 _fjsp_storel_v2r8( fptr+4, t5 );
849 _fjsp_storeh_v2r8( fptr+5, t5 );
850 _fjsp_storel_v2r8( fptr+6, t6 );
851 _fjsp_storeh_v2r8( fptr+7, t6 );
852 _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fptr+8), fiz3 ));
854 fix1 = _fjsp_add_v2r8(fix1, fix3);
855 t1 = _fjsp_shuffle_v2r8(fiz1, fiy2, GMX_FJSP_SHUFFLE2(0, 1));
856 fix1 = _fjsp_add_v2r8(fix1, t1); /* x and y sums */
858 t2 = _fjsp_shuffle_v2r8(fiy2, fiy2, GMX_FJSP_SHUFFLE2(1, 1));
859 fiz1 = _fjsp_add_v2r8(fiz1, fiz3);
860 fiz1 = _fjsp_add_v2r8(fiz1, t2); /* z sum */
862 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
863 _fjsp_storel_v2r8( fshiftptr, t3 );
864 _fjsp_storeh_v2r8( fshiftptr+1, t3 );
865 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
/* Reduces the force accumulators of four atoms across the two elements and
 * adds the results to memory.
 * Neighbouring accumulators are transposed pairwise and added, which leaves
 * the per-component sums packed as (x1,y1) (z1,x2) (y2,z2) (x3,y3) (z3,x4)
 * (y4,z4); these twelve sums are added to fptr[0..11].
 * The four atoms are then summed per component (x and y together in fix1,
 * z in fiz1) and that total is added to fshiftptr[0..2].
 * NOTE(review): which elements t1/t2 pick out of their operands depends on
 * the operand and selector convention of _fjsp_shuffle_v2r8 - confirm.
 */
869 static gmx_inline void
870 gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
871 _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
872 _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
873 _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
874 double * gmx_restrict fptr,
875 double * gmx_restrict fshiftptr)
877 __m128d t1, t2, t3, t4, t5, t6, t7, t8;
880 GMX_FJSP_TRANSPOSE2_V2R8(fix1, fiy1);
881 GMX_FJSP_TRANSPOSE2_V2R8(fiz1, fix2);
882 GMX_FJSP_TRANSPOSE2_V2R8(fiy2, fiz2);
883 GMX_FJSP_TRANSPOSE2_V2R8(fix3, fiy3);
884 GMX_FJSP_TRANSPOSE2_V2R8(fiz3, fix4);
885 GMX_FJSP_TRANSPOSE2_V2R8(fiy4, fiz4);
887 fix1 = _fjsp_add_v2r8(fix1, fiy1);
888 fiz1 = _fjsp_add_v2r8(fiz1, fix2);
889 fiy2 = _fjsp_add_v2r8(fiy2, fiz2);
890 fix3 = _fjsp_add_v2r8(fix3, fiy3);
891 fiz3 = _fjsp_add_v2r8(fiz3, fix4);
892 fiy4 = _fjsp_add_v2r8(fiy4, fiz4);
894 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
895 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
896 t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
897 t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
898 t7 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+8), fiz3 );
899 t8 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+10), fiy4 );
900 _fjsp_storel_v2r8( fptr, t3 );
901 _fjsp_storeh_v2r8( fptr+1, t3 );
902 _fjsp_storel_v2r8( fptr+2, t4 );
903 _fjsp_storeh_v2r8( fptr+3, t4 );
904 _fjsp_storel_v2r8( fptr+4, t5 );
905 _fjsp_storeh_v2r8( fptr+5, t5 );
906 _fjsp_storel_v2r8( fptr+6, t6 );
907 _fjsp_storeh_v2r8( fptr+7, t6 );
908 _fjsp_storel_v2r8( fptr+8, t7 );
909 _fjsp_storeh_v2r8( fptr+9, t7 );
910 _fjsp_storel_v2r8( fptr+10, t8 );
911 _fjsp_storeh_v2r8( fptr+11, t8 );
913 t1 = _fjsp_shuffle_v2r8(fiz1, fiy2, GMX_FJSP_SHUFFLE2(0, 1));
914 fix1 = _fjsp_add_v2r8(fix1, t1);
915 t2 = _fjsp_shuffle_v2r8(fiz3, fiy4, GMX_FJSP_SHUFFLE2(0, 1));
916 fix3 = _fjsp_add_v2r8(fix3, t2);
917 fix1 = _fjsp_add_v2r8(fix1, fix3); /* x and y sums */
919 fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2, fiy2));
920 fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4, fiy4));
921 fiz1 = _fjsp_add_v2r8(fiz1, fiz3); /* z sum */
923 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
924 _fjsp_storel_v2r8( fshiftptr, t3 );
925 _fjsp_storeh_v2r8( fshiftptr+1, t3 );
926 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
/* Adds both elements of pot1 together (low + high) and accumulates that
 * scalar sum into the double at *ptrA.
 */
931 static gmx_inline void
932 gmx_fjsp_update_1pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA)
934 pot1 = _fjsp_add_v2r8(pot1, _fjsp_unpackhi_v2r8(pot1, pot1));
935 _fjsp_storel_v2r8(ptrA, _fjsp_add_v2r8(pot1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA)));
/* Reduces two potential accumulators at once.
 * After the transpose and add, the low element of pot1 is the sum of the two
 * elements of the original pot1 and the high element is the sum of the two
 * elements of the original pot2. The first is accumulated into *ptrA; the
 * second is moved to the low position with unpackhi and accumulated into
 * *ptrB.
 */
938 static gmx_inline void
939 gmx_fjsp_update_2pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA,
940 _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
942 GMX_FJSP_TRANSPOSE2_V2R8(pot1, pot2);
943 pot1 = _fjsp_add_v2r8(pot1, pot2);
944 pot2 = _fjsp_unpackhi_v2r8(pot1, pot1);
946 _fjsp_storel_v2r8(ptrA, _fjsp_add_v2r8(pot1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA)));
947 _fjsp_storel_v2r8(ptrB, _fjsp_add_v2r8(pot2, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB)));
951 #endif /* _kernelutil_sparc64_hpc_ace_double_h_ */