2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef _kernelutil_sparc64_hpc_ace_double_h_
36 #define _kernelutil_sparc64_hpc_ace_double_h_
38 /* Get gmx_simd_exp_d() */
39 #include "gromacs/simd/simd.h"
40 #include "gromacs/simd/simd_math.h"
42 /* Fujitsu header borrows the name from SSE2, since some instructions have aliases.
43 * Environment/compiler version GM-1.2.0-17 seems to be buggy; when -Xg is
44 * defined to enable GNUC extensions, this sets _ISOC99_SOURCE, which in
45 * turn causes all intrinsics to be declared inline _instead_ of static. This
46 * leads to duplicate symbol errors at link time.
47 * To work around this we unset this before including the HPC-ACE header, and
48 * reset the value afterwards.
51 # undef _ISOC99_SOURCE
52 # define SAVE_ISOC99_SOURCE
55 #include <emmintrin.h>
57 #ifdef SAVE_ISOC99_SOURCE
58 # define _ISOC99_SOURCE
59 # undef SAVE_ISOC99_SOURCE
/* Build a 2-bit lane selector from two lane indices (cf. SSE2 _MM_SHUFFLE2) */
#define GMX_FJSP_SHUFFLE2(x, y) (((x)<<1) | (y))

/* In-place 2x2 transpose of the lanes of two v2r8 registers */
#define GMX_FJSP_TRANSPOSE2_V2R8(row0, row1) {            \
        _fjsp_v2r8 __gmx_t1 = row0;                       \
        row0 = _fjsp_unpacklo_v2r8(row0, row1);           \
        row1 = _fjsp_unpackhi_v2r8(__gmx_t1, row1);       \
}
72 gmx_fjsp_print_v2r8(const char *s, _fjsp_v2r8 a)
76 _fjsp_storel_v2r8(&lo, a);
77 _fjsp_storeh_v2r8(&hi, a);
78 printf("%s: %g %g\n", s, lo, hi);
83 gmx_fjsp_set1_v2r8(double d)
85 return _fjsp_set_v2r8(d, d);
89 gmx_fjsp_load1_v2r8(const double * gmx_restrict ptr)
91 return gmx_fjsp_set1_v2r8(*ptr);
96 gmx_fjsp_any_lt_v2r8(_fjsp_v2r8 a, _fjsp_v2r8 b)
105 a = _fjsp_cmplt_v2r8(a, b);
106 a = _fjsp_or_v2r8(a, _fjsp_unpackhi_v2r8(a, a));
107 _fjsp_storel_v2r8(&(conv.d), a);
108 return (conv.i != 0);
112 static gmx_inline _fjsp_v2r8
113 gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
115 const _fjsp_v2r8 half = gmx_fjsp_set1_v2r8(0.5);
116 const _fjsp_v2r8 three = gmx_fjsp_set1_v2r8(3.0);
117 _fjsp_v2r8 lu = _fjsp_rsqrta_v2r8(x);
119 lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
120 /* The HPC-ACE instruction set is only available in double precision, while
121 * single precision is typically sufficient for Gromacs. If you define
122 * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
123 * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
124 * double precision (53 bits). This is still clearly higher than single precision (24 bits).
126 #ifndef GMX_RELAXED_DOUBLE_PRECISION
127 lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
129 return _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
134 static gmx_inline _fjsp_v2r8
135 gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
137 const _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
138 __m128d lu = _fjsp_rcpa_v2r8(x);
140 /* Perform three N-R steps for double precision */
141 lu = _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
142 /* The HPC-ACE instruction set is only available in double precision, while
143 * single precision is typically sufficient for Gromacs. If you define
144 * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
145 * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
146 * double precision (53 bits). This is still clearly higher than single precision (24 bits).
148 #ifndef GMX_RELAXED_DOUBLE_PRECISION
149 lu = _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
151 return _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
155 static gmx_inline _fjsp_v2r8
156 gmx_fjsp_calc_rsq_v2r8(_fjsp_v2r8 dx, _fjsp_v2r8 dy, _fjsp_v2r8 dz)
158 return _fjsp_madd_v2r8(dx, dx, _fjsp_madd_v2r8(dy, dy, _fjsp_mul_v2r8(dz, dz)));
/* Normal sum of four v2r8 registers (comment previously said "ymm";
 * that was a leftover from the AVX version of this helper).
 */
#define gmx_fjsp_sum4_v2r8(t0, t1, t2, t3) _fjsp_add_v2r8(_fjsp_add_v2r8(t0, t1), _fjsp_add_v2r8(t2, t3))
169 gmx_fjsp_load_2real_swizzle_v2r8(const double * gmx_restrict ptrA,
170 const double * gmx_restrict ptrB)
172 return _fjsp_unpacklo_v2r8(_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA), _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB));
176 gmx_fjsp_load_1real_v2r8(const double * gmx_restrict ptrA)
178 return _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
183 gmx_fjsp_store_2real_swizzle_v2r8(double * gmx_restrict ptrA,
184 double * gmx_restrict ptrB,
189 t2 = _fjsp_unpackhi_v2r8(xmm1, xmm1);
190 _fjsp_storel_v2r8(ptrA, xmm1);
191 _fjsp_storel_v2r8(ptrB, t2);
195 gmx_fjsp_store_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
197 _fjsp_storel_v2r8(ptrA, xmm1);
201 /* Similar to store, but increments value in memory */
203 gmx_fjsp_increment_2real_swizzle_v2r8(double * gmx_restrict ptrA,
204 double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
208 t1 = _fjsp_unpackhi_v2r8(xmm1, xmm1);
209 xmm1 = _fjsp_add_v2r8(xmm1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA));
210 t1 = _fjsp_add_v2r8(t1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB));
211 _fjsp_storel_v2r8(ptrA, xmm1);
212 _fjsp_storel_v2r8(ptrB, t1);
216 gmx_fjsp_increment_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
220 tmp = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
221 tmp = _fjsp_add_v2r8(tmp, xmm1);
222 _fjsp_storel_v2r8(ptrA, tmp);
227 static gmx_inline void
228 gmx_fjsp_load_2pair_swizzle_v2r8(const double * gmx_restrict p1,
229 const double * gmx_restrict p2,
230 _fjsp_v2r8 * gmx_restrict c6,
231 _fjsp_v2r8 * gmx_restrict c12)
233 _fjsp_v2r8 t1, t2, t3;
235 /* The c6/c12 array should be aligned */
236 t1 = _fjsp_load_v2r8(p1);
237 t2 = _fjsp_load_v2r8(p2);
238 *c6 = _fjsp_unpacklo_v2r8(t1, t2);
239 *c12 = _fjsp_unpackhi_v2r8(t1, t2);
242 static gmx_inline void
243 gmx_fjsp_load_1pair_swizzle_v2r8(const double * gmx_restrict p1,
244 _fjsp_v2r8 * gmx_restrict c6,
245 _fjsp_v2r8 * gmx_restrict c12)
247 *c6 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
248 *c12 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
252 static gmx_inline void
253 gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
254 const double * gmx_restrict xyz,
255 _fjsp_v2r8 * gmx_restrict x1,
256 _fjsp_v2r8 * gmx_restrict y1,
257 _fjsp_v2r8 * gmx_restrict z1)
259 _fjsp_v2r8 mem_xy, mem_z, mem_sxy, mem_sz;
261 mem_xy = _fjsp_load_v2r8(xyz);
262 mem_z = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+2);
263 mem_sxy = _fjsp_load_v2r8(xyz_shift);
264 mem_sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
266 mem_xy = _fjsp_add_v2r8(mem_xy, mem_sxy);
267 mem_z = _fjsp_add_v2r8(mem_z, mem_sz);
269 *x1 = _fjsp_shuffle_v2r8(mem_xy, mem_xy, GMX_FJSP_SHUFFLE2(0, 0));
270 *y1 = _fjsp_shuffle_v2r8(mem_xy, mem_xy, GMX_FJSP_SHUFFLE2(1, 1));
271 *z1 = _fjsp_shuffle_v2r8(mem_z, mem_z, GMX_FJSP_SHUFFLE2(0, 0));
275 static gmx_inline void
276 gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
277 const double * gmx_restrict xyz,
278 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
279 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
280 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
282 _fjsp_v2r8 t1, t2, t3, t4, t5, sxy, sz, szx, syz;
284 t1 = _fjsp_load_v2r8(xyz);
285 t2 = _fjsp_load_v2r8(xyz+2);
286 t3 = _fjsp_load_v2r8(xyz+4);
287 t4 = _fjsp_load_v2r8(xyz+6);
288 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+8);
290 sxy = _fjsp_load_v2r8(xyz_shift);
291 sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
292 szx = _fjsp_shuffle_v2r8(sz, sxy, GMX_FJSP_SHUFFLE2(0, 0));
293 syz = _fjsp_shuffle_v2r8(sxy, sz, GMX_FJSP_SHUFFLE2(0, 1));
295 t1 = _fjsp_add_v2r8(t1, sxy);
296 t2 = _fjsp_add_v2r8(t2, szx);
297 t3 = _fjsp_add_v2r8(t3, syz);
298 t4 = _fjsp_add_v2r8(t4, sxy);
299 t5 = _fjsp_add_v2r8(t5, sz);
301 *x1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(0, 0));
302 *y1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(1, 1));
303 *z1 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(0, 0));
304 *x2 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(1, 1));
305 *y2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(0, 0));
306 *z2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(1, 1));
307 *x3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(0, 0));
308 *y3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(1, 1));
309 *z3 = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(0, 0));
313 static gmx_inline void
314 gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
315 const double * gmx_restrict xyz,
316 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
317 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
318 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
319 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
321 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;
323 t1 = _fjsp_load_v2r8(xyz);
324 t2 = _fjsp_load_v2r8(xyz+2);
325 t3 = _fjsp_load_v2r8(xyz+4);
326 t4 = _fjsp_load_v2r8(xyz+6);
327 t5 = _fjsp_load_v2r8(xyz+8);
328 t6 = _fjsp_load_v2r8(xyz+10);
330 sxy = _fjsp_load_v2r8(xyz_shift);
331 sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
332 szx = _fjsp_shuffle_v2r8(sz, sxy, GMX_FJSP_SHUFFLE2(0, 0));
333 syz = _fjsp_shuffle_v2r8(sxy, sz, GMX_FJSP_SHUFFLE2(0, 1));
335 t1 = _fjsp_add_v2r8(t1, sxy);
336 t2 = _fjsp_add_v2r8(t2, szx);
337 t3 = _fjsp_add_v2r8(t3, syz);
338 t4 = _fjsp_add_v2r8(t4, sxy);
339 t5 = _fjsp_add_v2r8(t5, szx);
340 t6 = _fjsp_add_v2r8(t6, syz);
342 *x1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(0, 0));
343 *y1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(1, 1));
344 *z1 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(0, 0));
345 *x2 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(1, 1));
346 *y2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(0, 0));
347 *z2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(1, 1));
348 *x3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(0, 0));
349 *y3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(1, 1));
350 *z3 = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(0, 0));
351 *x4 = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(1, 1));
352 *y4 = _fjsp_shuffle_v2r8(t6, t6, GMX_FJSP_SHUFFLE2(0, 0));
353 *z4 = _fjsp_shuffle_v2r8(t6, t6, GMX_FJSP_SHUFFLE2(1, 1));
358 static gmx_inline void
359 gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
360 _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
362 *x = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
363 *y = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
364 *z = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
367 static gmx_inline void
368 gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
369 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
370 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
371 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
373 *x1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
374 *y1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
375 *z1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
376 *x2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+3);
377 *y2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+4);
378 *z2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+5);
379 *x3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+6);
380 *y3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+7);
381 *z3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+8);
384 static gmx_inline void
385 gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
386 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
387 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
388 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
389 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
391 *x1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
392 *y1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
393 *z1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
394 *x2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+3);
395 *y2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+4);
396 *z2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+5);
397 *x3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+6);
398 *y3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+7);
399 *z3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+8);
400 *x4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+9);
401 *y4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+10);
402 *z4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+11);
406 static gmx_inline void
407 gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA,
408 const double * gmx_restrict ptrB,
409 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
411 _fjsp_v2r8 t1, t2, t3, t4;
412 t1 = _fjsp_load_v2r8(ptrA);
413 t2 = _fjsp_load_v2r8(ptrB);
414 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
415 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
416 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
419 *z1 = _fjsp_unpacklo_v2r8(t3, t4);
422 static gmx_inline void
423 gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
424 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
425 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
426 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
428 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
429 t1 = _fjsp_load_v2r8(ptrA);
430 t2 = _fjsp_load_v2r8(ptrB);
431 t3 = _fjsp_load_v2r8(ptrA+2);
432 t4 = _fjsp_load_v2r8(ptrB+2);
433 t5 = _fjsp_load_v2r8(ptrA+4);
434 t6 = _fjsp_load_v2r8(ptrB+4);
435 t7 = _fjsp_load_v2r8(ptrA+6);
436 t8 = _fjsp_load_v2r8(ptrB+6);
437 t9 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
438 t10 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+8);
439 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
440 GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
441 GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
442 GMX_FJSP_TRANSPOSE2_V2R8(t7, t8);
451 *z3 = _fjsp_unpacklo_v2r8(t9, t10);
455 static gmx_inline void
456 gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
457 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
458 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
459 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
460 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
462 _fjsp_v2r8 t1, t2, t3, t4, t5, t6;
463 t1 = _fjsp_load_v2r8(ptrA);
464 t2 = _fjsp_load_v2r8(ptrB);
465 t3 = _fjsp_load_v2r8(ptrA+2);
466 t4 = _fjsp_load_v2r8(ptrB+2);
467 t5 = _fjsp_load_v2r8(ptrA+4);
468 t6 = _fjsp_load_v2r8(ptrB+4);
469 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
470 GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
471 GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
478 t1 = _fjsp_load_v2r8(ptrA+6);
479 t2 = _fjsp_load_v2r8(ptrB+6);
480 t3 = _fjsp_load_v2r8(ptrA+8);
481 t4 = _fjsp_load_v2r8(ptrB+8);
482 t5 = _fjsp_load_v2r8(ptrA+10);
483 t6 = _fjsp_load_v2r8(ptrB+10);
484 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
485 GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
486 GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
497 gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
498 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
500 _fjsp_v2r8 t1, t2, t3;
502 t1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
503 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+1);
504 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
506 t1 = _fjsp_sub_v2r8(t1, x1);
507 t2 = _fjsp_sub_v2r8(t2, y1);
508 t3 = _fjsp_sub_v2r8(t3, z1);
509 _fjsp_storel_v2r8(ptrA, t1);
510 _fjsp_storel_v2r8(ptrA+1, t2);
511 _fjsp_storel_v2r8(ptrA+2, t3);
515 gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 fscal,
516 _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
518 _fjsp_v2r8 t1, t2, t3;
520 t1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
521 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+1);
522 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
524 t1 = _fjsp_nmsub_v2r8(fscal, dx1, t1);
525 t2 = _fjsp_nmsub_v2r8(fscal, dy1, t2);
526 t3 = _fjsp_nmsub_v2r8(fscal, dz1, t3);
527 _fjsp_storel_v2r8(ptrA, t1);
528 _fjsp_storel_v2r8(ptrA+1, t2);
529 _fjsp_storel_v2r8(ptrA+2, t3);
534 gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
535 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
536 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
537 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
539 _fjsp_v2r8 t1, t2, t3, t4, t5;
541 t1 = _fjsp_load_v2r8(ptrA);
542 t2 = _fjsp_load_v2r8(ptrA+2);
543 t3 = _fjsp_load_v2r8(ptrA+4);
544 t4 = _fjsp_load_v2r8(ptrA+6);
545 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
547 x1 = _fjsp_unpacklo_v2r8(x1, y1);
548 z1 = _fjsp_unpacklo_v2r8(z1, x2);
549 y2 = _fjsp_unpacklo_v2r8(y2, z2);
550 x3 = _fjsp_unpacklo_v2r8(x3, y3);
551 /* nothing to be done for z3 */
553 t1 = _fjsp_sub_v2r8(t1, x1);
554 t2 = _fjsp_sub_v2r8(t2, z1);
555 t3 = _fjsp_sub_v2r8(t3, y2);
556 t4 = _fjsp_sub_v2r8(t4, x3);
557 t5 = _fjsp_sub_v2r8(t5, z3);
558 _fjsp_storel_v2r8(ptrA, t1);
559 _fjsp_storeh_v2r8(ptrA+1, t1);
560 _fjsp_storel_v2r8(ptrA+2, t2);
561 _fjsp_storeh_v2r8(ptrA+3, t2);
562 _fjsp_storel_v2r8(ptrA+4, t3);
563 _fjsp_storeh_v2r8(ptrA+5, t3);
564 _fjsp_storel_v2r8(ptrA+6, t4);
565 _fjsp_storeh_v2r8(ptrA+7, t4);
566 _fjsp_storel_v2r8(ptrA+8, t5);
571 gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
572 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
573 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
574 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
575 _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
577 _fjsp_v2r8 t1, t2, t3, t4, t5, t6;
579 t1 = _fjsp_load_v2r8(ptrA);
580 t2 = _fjsp_load_v2r8(ptrA+2);
581 t3 = _fjsp_load_v2r8(ptrA+4);
582 t4 = _fjsp_load_v2r8(ptrA+6);
583 t5 = _fjsp_load_v2r8(ptrA+8);
584 t6 = _fjsp_load_v2r8(ptrA+10);
586 x1 = _fjsp_unpacklo_v2r8(x1, y1);
587 z1 = _fjsp_unpacklo_v2r8(z1, x2);
588 y2 = _fjsp_unpacklo_v2r8(y2, z2);
589 x3 = _fjsp_unpacklo_v2r8(x3, y3);
590 z3 = _fjsp_unpacklo_v2r8(z3, x4);
591 y4 = _fjsp_unpacklo_v2r8(y4, z4);
593 _fjsp_storel_v2r8(ptrA, _fjsp_sub_v2r8( t1, x1 ));
594 _fjsp_storeh_v2r8(ptrA+1, _fjsp_sub_v2r8( t1, x1 ));
595 _fjsp_storel_v2r8(ptrA+2, _fjsp_sub_v2r8( t2, z1 ));
596 _fjsp_storeh_v2r8(ptrA+3, _fjsp_sub_v2r8( t2, z1 ));
597 _fjsp_storel_v2r8(ptrA+4, _fjsp_sub_v2r8( t3, y2 ));
598 _fjsp_storeh_v2r8(ptrA+5, _fjsp_sub_v2r8( t3, y2 ));
599 _fjsp_storel_v2r8(ptrA+6, _fjsp_sub_v2r8( t4, x3 ));
600 _fjsp_storeh_v2r8(ptrA+7, _fjsp_sub_v2r8( t4, x3 ));
601 _fjsp_storel_v2r8(ptrA+8, _fjsp_sub_v2r8( t5, z3 ));
602 _fjsp_storeh_v2r8(ptrA+9, _fjsp_sub_v2r8( t5, z3 ));
603 _fjsp_storel_v2r8(ptrA+10, _fjsp_sub_v2r8( t6, y4 ));
604 _fjsp_storeh_v2r8(ptrA+11, _fjsp_sub_v2r8( t6, y4 ));
608 gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
609 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
611 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7;
613 t1 = _fjsp_load_v2r8(ptrA);
614 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
615 t3 = _fjsp_load_v2r8(ptrB);
616 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
618 t5 = _fjsp_unpacklo_v2r8(x1, y1);
619 t6 = _fjsp_unpackhi_v2r8(x1, y1);
620 t7 = _fjsp_unpackhi_v2r8(z1, z1);
622 t1 = _fjsp_sub_v2r8(t1, t5);
623 t2 = _fjsp_sub_v2r8(t2, z1);
625 t3 = _fjsp_sub_v2r8(t3, t6);
626 t4 = _fjsp_sub_v2r8(t4, t7);
628 _fjsp_storel_v2r8(ptrA, t1);
629 _fjsp_storeh_v2r8(ptrA+1, t1);
630 _fjsp_storel_v2r8(ptrA+2, t2);
631 _fjsp_storel_v2r8(ptrB, t3);
632 _fjsp_storeh_v2r8(ptrB+1, t3);
633 _fjsp_storel_v2r8(ptrB+2, t4);
638 gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
639 _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
641 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, fscalA, fscalB;
643 t1 = _fjsp_load_v2r8(ptrA);
644 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
645 t3 = _fjsp_load_v2r8(ptrB);
646 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
647 fscalA = _fjsp_unpacklo_v2r8(fscal, fscal);
648 fscalB = _fjsp_unpackhi_v2r8(fscal, fscal);
650 t5 = _fjsp_unpacklo_v2r8(dx1, dy1);
651 t6 = _fjsp_unpackhi_v2r8(dx1, dy1);
652 t7 = _fjsp_unpackhi_v2r8(dz1, dz1);
654 t1 = _fjsp_nmsub_v2r8(fscalA, t5, t1);
655 t2 = _fjsp_nmsub_v2r8(fscalA, dz1, t2);
657 t3 = _fjsp_nmsub_v2r8(fscalB, t6, t3);
658 t4 = _fjsp_nmsub_v2r8(fscalB, t7, t4);
660 _fjsp_storel_v2r8(ptrA, t1);
661 _fjsp_storeh_v2r8(ptrA+1, t1);
662 _fjsp_storel_v2r8(ptrA+2, t2);
663 _fjsp_storel_v2r8(ptrB, t3);
664 _fjsp_storeh_v2r8(ptrB+1, t3);
665 _fjsp_storel_v2r8(ptrB+2, t4);
670 gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
671 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
672 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
673 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
675 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
676 _fjsp_v2r8 tA, tB, tC, tD, tE, tF, tG, tH, tI;
678 t1 = _fjsp_load_v2r8(ptrA);
679 t2 = _fjsp_load_v2r8(ptrA+2);
680 t3 = _fjsp_load_v2r8(ptrA+4);
681 t4 = _fjsp_load_v2r8(ptrA+6);
682 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
683 t6 = _fjsp_load_v2r8(ptrB);
684 t7 = _fjsp_load_v2r8(ptrB+2);
685 t8 = _fjsp_load_v2r8(ptrB+4);
686 t9 = _fjsp_load_v2r8(ptrB+6);
687 t10 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+8);
689 tA = _fjsp_unpacklo_v2r8(x1, y1);
690 tB = _fjsp_unpackhi_v2r8(x1, y1);
691 tC = _fjsp_unpacklo_v2r8(z1, x2);
692 tD = _fjsp_unpackhi_v2r8(z1, x2);
693 tE = _fjsp_unpacklo_v2r8(y2, z2);
694 tF = _fjsp_unpackhi_v2r8(y2, z2);
695 tG = _fjsp_unpacklo_v2r8(x3, y3);
696 tH = _fjsp_unpackhi_v2r8(x3, y3);
697 tI = _fjsp_unpackhi_v2r8(z3, z3);
699 t1 = _fjsp_sub_v2r8(t1, tA);
700 t2 = _fjsp_sub_v2r8(t2, tC);
701 t3 = _fjsp_sub_v2r8(t3, tE);
702 t4 = _fjsp_sub_v2r8(t4, tG);
703 t5 = _fjsp_sub_v2r8(t5, z3);
705 t6 = _fjsp_sub_v2r8(t6, tB);
706 t7 = _fjsp_sub_v2r8(t7, tD);
707 t8 = _fjsp_sub_v2r8(t8, tF);
708 t9 = _fjsp_sub_v2r8(t9, tH);
709 t10 = _fjsp_sub_v2r8(t10, tI);
711 _fjsp_storel_v2r8(ptrA, t1);
712 _fjsp_storeh_v2r8(ptrA+1, t1);
713 _fjsp_storel_v2r8(ptrA+2, t2);
714 _fjsp_storeh_v2r8(ptrA+3, t2);
715 _fjsp_storel_v2r8(ptrA+4, t3);
716 _fjsp_storeh_v2r8(ptrA+5, t3);
717 _fjsp_storel_v2r8(ptrA+6, t4);
718 _fjsp_storeh_v2r8(ptrA+7, t4);
719 _fjsp_storel_v2r8(ptrA+8, t5);
720 _fjsp_storel_v2r8(ptrB, t6);
721 _fjsp_storeh_v2r8(ptrB+1, t6);
722 _fjsp_storel_v2r8(ptrB+2, t7);
723 _fjsp_storeh_v2r8(ptrB+3, t7);
724 _fjsp_storel_v2r8(ptrB+4, t8);
725 _fjsp_storeh_v2r8(ptrB+5, t8);
726 _fjsp_storel_v2r8(ptrB+6, t9);
727 _fjsp_storeh_v2r8(ptrB+7, t9);
728 _fjsp_storel_v2r8(ptrB+8, t10);
733 gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
734 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
735 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
736 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
737 _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
739 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
740 _fjsp_v2r8 tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;
742 t1 = _fjsp_load_v2r8(ptrA);
743 t2 = _fjsp_load_v2r8(ptrA+2);
744 t3 = _fjsp_load_v2r8(ptrA+4);
745 t4 = _fjsp_load_v2r8(ptrA+6);
746 t5 = _fjsp_load_v2r8(ptrA+8);
747 t6 = _fjsp_load_v2r8(ptrA+10);
748 t7 = _fjsp_load_v2r8(ptrB);
749 t8 = _fjsp_load_v2r8(ptrB+2);
750 t9 = _fjsp_load_v2r8(ptrB+4);
751 t10 = _fjsp_load_v2r8(ptrB+6);
752 t11 = _fjsp_load_v2r8(ptrB+8);
753 t12 = _fjsp_load_v2r8(ptrB+10);
755 tA = _fjsp_unpacklo_v2r8(x1, y1);
756 tB = _fjsp_unpackhi_v2r8(x1, y1);
757 tC = _fjsp_unpacklo_v2r8(z1, x2);
758 tD = _fjsp_unpackhi_v2r8(z1, x2);
759 tE = _fjsp_unpacklo_v2r8(y2, z2);
760 tF = _fjsp_unpackhi_v2r8(y2, z2);
761 tG = _fjsp_unpacklo_v2r8(x3, y3);
762 tH = _fjsp_unpackhi_v2r8(x3, y3);
763 tI = _fjsp_unpacklo_v2r8(z3, x4);
764 tJ = _fjsp_unpackhi_v2r8(z3, x4);
765 tK = _fjsp_unpacklo_v2r8(y4, z4);
766 tL = _fjsp_unpackhi_v2r8(y4, z4);
768 t1 = _fjsp_sub_v2r8(t1, tA);
769 t2 = _fjsp_sub_v2r8(t2, tC);
770 t3 = _fjsp_sub_v2r8(t3, tE);
771 t4 = _fjsp_sub_v2r8(t4, tG);
772 t5 = _fjsp_sub_v2r8(t5, tI);
773 t6 = _fjsp_sub_v2r8(t6, tK);
775 t7 = _fjsp_sub_v2r8(t7, tB);
776 t8 = _fjsp_sub_v2r8(t8, tD);
777 t9 = _fjsp_sub_v2r8(t9, tF);
778 t10 = _fjsp_sub_v2r8(t10, tH);
779 t11 = _fjsp_sub_v2r8(t11, tJ);
780 t12 = _fjsp_sub_v2r8(t12, tL);
782 _fjsp_storel_v2r8(ptrA, t1);
783 _fjsp_storeh_v2r8(ptrA+1, t1);
784 _fjsp_storel_v2r8(ptrA+2, t2);
785 _fjsp_storeh_v2r8(ptrA+3, t2);
786 _fjsp_storel_v2r8(ptrA+4, t3);
787 _fjsp_storeh_v2r8(ptrA+5, t3);
788 _fjsp_storel_v2r8(ptrA+6, t4);
789 _fjsp_storeh_v2r8(ptrA+7, t4);
790 _fjsp_storel_v2r8(ptrA+8, t5);
791 _fjsp_storeh_v2r8(ptrA+9, t5);
792 _fjsp_storel_v2r8(ptrA+10, t6);
793 _fjsp_storeh_v2r8(ptrA+11, t6);
794 _fjsp_storel_v2r8(ptrB, t7);
795 _fjsp_storeh_v2r8(ptrB+1, t7);
796 _fjsp_storel_v2r8(ptrB+2, t8);
797 _fjsp_storeh_v2r8(ptrB+3, t8);
798 _fjsp_storel_v2r8(ptrB+4, t9);
799 _fjsp_storeh_v2r8(ptrB+5, t9);
800 _fjsp_storel_v2r8(ptrB+6, t10);
801 _fjsp_storeh_v2r8(ptrB+7, t10);
802 _fjsp_storel_v2r8(ptrB+8, t11);
803 _fjsp_storeh_v2r8(ptrB+9, t11);
804 _fjsp_storel_v2r8(ptrB+10, t12);
805 _fjsp_storeh_v2r8(ptrB+11, t12);
810 static gmx_inline void
811 gmx_fjsp_update_iforce_1atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
812 double * gmx_restrict fptr,
813 double * gmx_restrict fshiftptr)
815 __m128d t1, t2, t3, t4;
819 fix1 = _fjsp_unpacklo_v2r8(fix1, fiy1); /* y0 x0 */
820 fiy1 = _fjsp_unpackhi_v2r8(t1, fiy1); /* y1 x1 */
822 fix1 = _fjsp_add_v2r8(fix1, fiy1);
823 fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1, fiz1 ));
825 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
826 _fjsp_storel_v2r8( fptr, t4 );
827 _fjsp_storeh_v2r8( fptr+1, t4 );
828 _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fptr+2), fiz1 ));
830 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
831 _fjsp_storel_v2r8( fshiftptr, t4 );
832 _fjsp_storeh_v2r8( fshiftptr+1, t4 );
833 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
836 static gmx_inline void
837 gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
838 _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
839 _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
840 double * gmx_restrict fptr,
841 double * gmx_restrict fshiftptr)
843 __m128d t1, t2, t3, t4, t5, t6;
846 GMX_FJSP_TRANSPOSE2_V2R8(fix1, fiy1);
847 GMX_FJSP_TRANSPOSE2_V2R8(fiz1, fix2);
848 GMX_FJSP_TRANSPOSE2_V2R8(fiy2, fiz2);
850 fix3 = _fjsp_unpacklo_v2r8(fix3, fiy3); /* y0 x0 */
851 fiy3 = _fjsp_unpackhi_v2r8(t1, fiy3); /* y1 x1 */
853 fix1 = _fjsp_add_v2r8(fix1, fiy1);
854 fiz1 = _fjsp_add_v2r8(fiz1, fix2);
855 fiy2 = _fjsp_add_v2r8(fiy2, fiz2);
857 fix3 = _fjsp_add_v2r8(fix3, fiy3);
858 fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3, fiz3));
860 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
861 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
862 t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
863 t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
865 _fjsp_storel_v2r8( fptr, t3 );
866 _fjsp_storeh_v2r8( fptr+1, t3 );
867 _fjsp_storel_v2r8( fptr+2, t4 );
868 _fjsp_storeh_v2r8( fptr+3, t4 );
869 _fjsp_storel_v2r8( fptr+4, t5 );
870 _fjsp_storeh_v2r8( fptr+5, t5 );
871 _fjsp_storel_v2r8( fptr+6, t6 );
872 _fjsp_storeh_v2r8( fptr+7, t6 );
873 _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fptr+8), fiz3 ));
875 fix1 = _fjsp_add_v2r8(fix1, fix3);
876 t1 = _fjsp_shuffle_v2r8(fiz1, fiy2, GMX_FJSP_SHUFFLE2(0, 1));
877 fix1 = _fjsp_add_v2r8(fix1, t1); /* x and y sums */
879 t2 = _fjsp_shuffle_v2r8(fiy2, fiy2, GMX_FJSP_SHUFFLE2(1, 1));
880 fiz1 = _fjsp_add_v2r8(fiz1, fiz3);
881 fiz1 = _fjsp_add_v2r8(fiz1, t2); /* z sum */
883 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
884 _fjsp_storel_v2r8( fshiftptr, t3 );
885 _fjsp_storeh_v2r8( fshiftptr+1, t3 );
886 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
890 static gmx_inline void
891 gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
892 _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
893 _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
894 _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
895 double * gmx_restrict fptr,
896 double * gmx_restrict fshiftptr)
898 __m128d t1, t2, t3, t4, t5, t6, t7, t8;
901 GMX_FJSP_TRANSPOSE2_V2R8(fix1, fiy1);
902 GMX_FJSP_TRANSPOSE2_V2R8(fiz1, fix2);
903 GMX_FJSP_TRANSPOSE2_V2R8(fiy2, fiz2);
904 GMX_FJSP_TRANSPOSE2_V2R8(fix3, fiy3);
905 GMX_FJSP_TRANSPOSE2_V2R8(fiz3, fix4);
906 GMX_FJSP_TRANSPOSE2_V2R8(fiy4, fiz4);
908 fix1 = _fjsp_add_v2r8(fix1, fiy1);
909 fiz1 = _fjsp_add_v2r8(fiz1, fix2);
910 fiy2 = _fjsp_add_v2r8(fiy2, fiz2);
911 fix3 = _fjsp_add_v2r8(fix3, fiy3);
912 fiz3 = _fjsp_add_v2r8(fiz3, fix4);
913 fiy4 = _fjsp_add_v2r8(fiy4, fiz4);
915 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
916 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
917 t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
918 t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
919 t7 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+8), fiz3 );
920 t8 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+10), fiy4 );
921 _fjsp_storel_v2r8( fptr, t3 );
922 _fjsp_storeh_v2r8( fptr+1, t3 );
923 _fjsp_storel_v2r8( fptr+2, t4 );
924 _fjsp_storeh_v2r8( fptr+3, t4 );
925 _fjsp_storel_v2r8( fptr+4, t5 );
926 _fjsp_storeh_v2r8( fptr+5, t5 );
927 _fjsp_storel_v2r8( fptr+6, t6 );
928 _fjsp_storeh_v2r8( fptr+7, t6 );
929 _fjsp_storel_v2r8( fptr+8, t7 );
930 _fjsp_storeh_v2r8( fptr+9, t7 );
931 _fjsp_storel_v2r8( fptr+10, t8 );
932 _fjsp_storeh_v2r8( fptr+11, t8 );
934 t1 = _fjsp_shuffle_v2r8(fiz1, fiy2, GMX_FJSP_SHUFFLE2(0, 1));
935 fix1 = _fjsp_add_v2r8(fix1, t1);
936 t2 = _fjsp_shuffle_v2r8(fiz3, fiy4, GMX_FJSP_SHUFFLE2(0, 1));
937 fix3 = _fjsp_add_v2r8(fix3, t2);
938 fix1 = _fjsp_add_v2r8(fix1, fix3); /* x and y sums */
940 fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2, fiy2));
941 fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4, fiy4));
942 fiz1 = _fjsp_add_v2r8(fiz1, fiz3); /* z sum */
944 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
945 _fjsp_storel_v2r8( fshiftptr, t3 );
946 _fjsp_storeh_v2r8( fshiftptr+1, t3 );
947 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
952 static gmx_inline void
953 gmx_fjsp_update_1pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA)
955 pot1 = _fjsp_add_v2r8(pot1, _fjsp_unpackhi_v2r8(pot1, pot1));
956 _fjsp_storel_v2r8(ptrA, _fjsp_add_v2r8(pot1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA)));
959 static gmx_inline void
960 gmx_fjsp_update_2pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA,
961 _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
963 GMX_FJSP_TRANSPOSE2_V2R8(pot1, pot2);
964 pot1 = _fjsp_add_v2r8(pot1, pot2);
965 pot2 = _fjsp_unpackhi_v2r8(pot1, pot1);
967 _fjsp_storel_v2r8(ptrA, _fjsp_add_v2r8(pot1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA)));
968 _fjsp_storel_v2r8(ptrB, _fjsp_add_v2r8(pot2, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB)));
972 #endif /* _kernelutil_sparc64_hpc_ace_double_h_ */