2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
35 #ifndef _kernelutil_sparc64_hpc_ace_double_h_
36 #define _kernelutil_sparc64_hpc_ace_double_h_
38 /* Get gmx_simd_exp_d() */
39 #include "gromacs/simd/simd.h"
40 #include "gromacs/simd/simd_math.h"
42 /* Fujitsu header borrows the name from SSE2, since some instructions have aliases */
43 #include <emmintrin.h>
/* Build the 2-bit immediate for _fjsp_shuffle_v2r8: x selects the element for
 * the high slot, y for the low slot (same layout as SSE2's _MM_SHUFFLE2).
 */
45 #define GMX_FJSP_SHUFFLE2(x, y) (((x)<<1) | (y))
/* In-place 2x2 transpose of two v2r8 rows: afterwards row0 holds the two low
 * elements and row1 the two high elements of the original register pair.
 * NOTE(review): the closing '}' of this multi-line macro is not visible in
 * this view of the file (brace/blank lines appear elided by extraction) -
 * confirm against the original source.
 */
47 #define GMX_FJSP_TRANSPOSE2_V2R8(row0, row1) { \
48 _fjsp_v2r8 __gmx_t1 = row0; \
49 row0 = _fjsp_unpacklo_v2r8(row0, row1); \
50 row1 = _fjsp_unpackhi_v2r8(__gmx_t1, row1); \
/* Debug helper: print both double-precision lanes of a v2r8, low lane first.
 * NOTE(review): the return-type line, braces and the declarations of the
 * locals 'lo' and 'hi' (presumably double) are not visible in this view.
 */
55 gmx_fjsp_print_v2r8(const char *s, _fjsp_v2r8 a)
59 _fjsp_storel_v2r8(&lo, a);
60 _fjsp_storeh_v2r8(&hi, a);
61 printf("%s: %g %g\n", s, lo, hi);
/* Broadcast a scalar double into both lanes of a v2r8. */
66 gmx_fjsp_set1_v2r8(double d)
68 return _fjsp_set_v2r8(d, d);
/* Load one double from memory and broadcast it to both lanes. */
72 gmx_fjsp_load1_v2r8(const double * gmx_restrict ptr)
74 return gmx_fjsp_set1_v2r8(*ptr);
/* Returns nonzero if a<b holds in any lane: the comparison mask is OR-ed
 * across the two lanes, then the low lane is stored for inspection.
 * NOTE(review): the declaration of the 'conv' union and the final return
 * statement are not visible in this view - confirm against the original.
 */
79 gmx_fjsp_any_lt_v2r8(_fjsp_v2r8 a, _fjsp_v2r8 b)
88 a = _fjsp_cmplt_v2r8(a, b);
89 a = _fjsp_or_v2r8(a, _fjsp_unpackhi_v2r8(a, a));
90 _fjsp_storel_v2r8(&(conv.d), a);
/* 1/sqrt(x) in both lanes: hardware reciprocal-square-root estimate refined
 * by Newton-Raphson steps of the form lu <- 0.5*lu*(3 - (lu*lu)*x), using the
 * fused negative-multiply-subtract intrinsic. Two steps are always applied;
 * a third (below) is skipped when GMX_RELAXED_DOUBLE_PRECISION is defined.
 */
95 static gmx_inline _fjsp_v2r8
96 gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
98 const _fjsp_v2r8 half = gmx_fjsp_set1_v2r8(0.5);
99 const _fjsp_v2r8 three = gmx_fjsp_set1_v2r8(3.0);
100 _fjsp_v2r8 lu = _fjsp_rsqrta_v2r8(x);
102 lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
103 /* The HPC-ACE instruction set is only available in double precision, while
104 * single precision is typically sufficient for Gromacs. If you define
105 * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
106 * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
107 * double precision (53 bits). This is still clearly higher than single precision (24 bits).
 */
109 #ifndef GMX_RELAXED_DOUBLE_PRECISION
110 lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
/* NOTE(review): the matching #endif for the #ifndef above is not visible in
 * this view of the file - confirm against the original source.
 */
112 return _fjsp_mul_v2r8(_fjsp_mul_v2r8(half, lu), _fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu, lu), x, three));
/* 1.0/x in both lanes: hardware reciprocal estimate refined by Newton-Raphson
 * steps lu <- lu*(2 - lu*x). The local 'lu' is declared __m128d; presumably
 * the Fujitsu header aliases this to _fjsp_v2r8 (see the emmintrin.h comment
 * at the top of the file) - TODO confirm.
 */
117 static gmx_inline _fjsp_v2r8
118 gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
120 const _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
121 __m128d lu = _fjsp_rcpa_v2r8(x);
123 /* Perform three N-R steps for double precision */
124 lu = _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
125 /* The HPC-ACE instruction set is only available in double precision, while
126 * single precision is typically sufficient for Gromacs. If you define
127 * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
128 * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
129 * double precision (53 bits). This is still clearly higher than single precision (24 bits).
 */
131 #ifndef GMX_RELAXED_DOUBLE_PRECISION
132 lu = _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
/* NOTE(review): the matching #endif for the #ifndef above is not visible in
 * this view of the file - confirm against the original source.
 */
134 return _fjsp_mul_v2r8(lu, _fjsp_nmsub_v2r8(lu, x, two));
/* rsq = dx*dx + dy*dy + dz*dz for both lanes, using fused multiply-add. */
138 static gmx_inline _fjsp_v2r8
139 gmx_fjsp_calc_rsq_v2r8(_fjsp_v2r8 dx, _fjsp_v2r8 dy, _fjsp_v2r8 dz)
141 return _fjsp_madd_v2r8(dx, dx, _fjsp_madd_v2r8(dy, dy, _fjsp_mul_v2r8(dz, dz)));
144 /* Normal lane-wise sum of four v2r8 registers (comment wording inherited
 * from the x86 "ymm" kernels this file was adapted from) */
145 #define gmx_fjsp_sum4_v2r8(t0, t1, t2, t3) _fjsp_add_v2r8(_fjsp_add_v2r8(t0, t1), _fjsp_add_v2r8(t2, t3))
/* Gather one double from each of two pointers into one v2r8:
 * low lane = *ptrA, high lane = *ptrB.
 */
152 gmx_fjsp_load_2real_swizzle_v2r8(const double * gmx_restrict ptrA,
153 const double * gmx_restrict ptrB)
155 return _fjsp_unpacklo_v2r8(_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA), _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB));
/* Load a single double into the low lane; high lane is zero. */
159 gmx_fjsp_load_1real_v2r8(const double * gmx_restrict ptrA)
161 return _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
/* Scatter the two lanes of xmm1 to two scalar locations:
 * *ptrA = low lane, *ptrB = high lane.
 * NOTE(review): the parameter line declaring xmm1 and the declaration of the
 * local t2 are not visible in this view - confirm against the original.
 */
166 gmx_fjsp_store_2real_swizzle_v2r8(double * gmx_restrict ptrA,
167 double * gmx_restrict ptrB,
172 t2 = _fjsp_unpackhi_v2r8(xmm1, xmm1);
173 _fjsp_storel_v2r8(ptrA, xmm1);
174 _fjsp_storel_v2r8(ptrB, t2);
/* Store the low lane of xmm1 to *ptrA. */
178 gmx_fjsp_store_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
180 _fjsp_storel_v2r8(ptrA, xmm1);
184 /* Similar to store, but increments value in memory */
/* *ptrA += low lane of xmm1; *ptrB += high lane of xmm1.
 * NOTE(review): the declaration of the local t1 is not visible in this view.
 */
186 gmx_fjsp_increment_2real_swizzle_v2r8(double * gmx_restrict ptrA,
187 double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
191 t1 = _fjsp_unpackhi_v2r8(xmm1, xmm1);
192 xmm1 = _fjsp_add_v2r8(xmm1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA));
193 t1 = _fjsp_add_v2r8(t1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB));
194 _fjsp_storel_v2r8(ptrA, xmm1);
195 _fjsp_storel_v2r8(ptrB, t1);
/* *ptrA += low lane of xmm1.
 * NOTE(review): the declaration of the local tmp is not visible in this view.
 */
199 gmx_fjsp_increment_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
203 tmp = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
204 tmp = _fjsp_add_v2r8(tmp, xmm1);
205 _fjsp_storel_v2r8(ptrA, tmp);
/* Load interleaved {c6,c12} parameter pairs for two interaction partners and
 * de-interleave them: *c6 = {p1[0], p2[0]}, *c12 = {p1[1], p2[1]}.
 */
210 static gmx_inline void
211 gmx_fjsp_load_2pair_swizzle_v2r8(const double * gmx_restrict p1,
212 const double * gmx_restrict p2,
213 _fjsp_v2r8 * gmx_restrict c6,
214 _fjsp_v2r8 * gmx_restrict c12)
216 _fjsp_v2r8 t1, t2, t3;
218 /* The c6/c12 array should be aligned */
219 t1 = _fjsp_load_v2r8(p1);
220 t2 = _fjsp_load_v2r8(p2);
221 *c6 = _fjsp_unpacklo_v2r8(t1, t2);
222 *c12 = _fjsp_unpackhi_v2r8(t1, t2);
/* Single-partner variant: *c6 = p1[0] and *c12 = p1[1], each in the low lane
 * with the high lane zeroed.
 */
225 static gmx_inline void
226 gmx_fjsp_load_1pair_swizzle_v2r8(const double * gmx_restrict p1,
227 _fjsp_v2r8 * gmx_restrict c6,
228 _fjsp_v2r8 * gmx_restrict c12)
230 *c6 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
231 *c12 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
/* Load one xyz coordinate triplet, add the periodic shift vector, and
 * broadcast each shifted component to both lanes of its output register.
 */
235 static gmx_inline void
236 gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
237 const double * gmx_restrict xyz,
238 _fjsp_v2r8 * gmx_restrict x1,
239 _fjsp_v2r8 * gmx_restrict y1,
240 _fjsp_v2r8 * gmx_restrict z1)
242 _fjsp_v2r8 mem_xy, mem_z, mem_sxy, mem_sz;
/* {x,y} packed together, z alone in the low lane; same layout for the shift */
244 mem_xy = _fjsp_load_v2r8(xyz);
245 mem_z = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+2);
246 mem_sxy = _fjsp_load_v2r8(xyz_shift);
247 mem_sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
249 mem_xy = _fjsp_add_v2r8(mem_xy, mem_sxy);
250 mem_z = _fjsp_add_v2r8(mem_z, mem_sz);
/* splat each component into both lanes */
252 *x1 = _fjsp_shuffle_v2r8(mem_xy, mem_xy, GMX_FJSP_SHUFFLE2(0, 0));
253 *y1 = _fjsp_shuffle_v2r8(mem_xy, mem_xy, GMX_FJSP_SHUFFLE2(1, 1));
254 *z1 = _fjsp_shuffle_v2r8(mem_z, mem_z, GMX_FJSP_SHUFFLE2(0, 0));
/* Load three consecutive xyz triplets (9 doubles), add the periodic shift to
 * each, and broadcast every shifted component to both lanes of its output.
 * Because the 9 doubles are loaded as 2-wide vectors, the shift must be
 * presented in three rotations (sxy, szx, syz) to line up with the packing.
 */
258 static gmx_inline void
259 gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
260 const double * gmx_restrict xyz,
261 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
262 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
263 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
265 _fjsp_v2r8 t1, t2, t3, t4, t5, sxy, sz, szx, syz;
/* t1..t5 hold {x1,y1},{z1,x2},{y2,z2},{x3,y3},{z3,-} */
267 t1 = _fjsp_load_v2r8(xyz);
268 t2 = _fjsp_load_v2r8(xyz+2);
269 t3 = _fjsp_load_v2r8(xyz+4);
270 t4 = _fjsp_load_v2r8(xyz+6);
271 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz+8);
273 sxy = _fjsp_load_v2r8(xyz_shift);
274 sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
275 szx = _fjsp_shuffle_v2r8(sz, sxy, GMX_FJSP_SHUFFLE2(0, 0));
276 syz = _fjsp_shuffle_v2r8(sxy, sz, GMX_FJSP_SHUFFLE2(0, 1));
278 t1 = _fjsp_add_v2r8(t1, sxy);
279 t2 = _fjsp_add_v2r8(t2, szx);
280 t3 = _fjsp_add_v2r8(t3, syz);
281 t4 = _fjsp_add_v2r8(t4, sxy);
282 t5 = _fjsp_add_v2r8(t5, sz);
/* splat each shifted component into both lanes of its output register */
284 *x1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(0, 0));
285 *y1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(1, 1));
286 *z1 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(0, 0));
287 *x2 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(1, 1));
288 *y2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(0, 0));
289 *z2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(1, 1));
290 *x3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(0, 0));
291 *y3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(1, 1));
292 *z3 = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(0, 0));
/* Load four consecutive xyz triplets (12 doubles), add the periodic shift to
 * each, and broadcast every shifted component to both lanes of its output.
 * As in the 3-rvec variant, the shift is used in three rotations (sxy, szx,
 * syz) to match the 2-wide packing of the 12 coordinates.
 */
296 static gmx_inline void
297 gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
298 const double * gmx_restrict xyz,
299 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
300 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
301 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
302 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
304 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, sxy, sz, szx, syz;
/* t1..t6 hold {x1,y1},{z1,x2},{y2,z2},{x3,y3},{z3,x4},{y4,z4} */
306 t1 = _fjsp_load_v2r8(xyz);
307 t2 = _fjsp_load_v2r8(xyz+2);
308 t3 = _fjsp_load_v2r8(xyz+4);
309 t4 = _fjsp_load_v2r8(xyz+6);
310 t5 = _fjsp_load_v2r8(xyz+8);
311 t6 = _fjsp_load_v2r8(xyz+10);
313 sxy = _fjsp_load_v2r8(xyz_shift);
314 sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), xyz_shift+2);
315 szx = _fjsp_shuffle_v2r8(sz, sxy, GMX_FJSP_SHUFFLE2(0, 0));
316 syz = _fjsp_shuffle_v2r8(sxy, sz, GMX_FJSP_SHUFFLE2(0, 1));
318 t1 = _fjsp_add_v2r8(t1, sxy);
319 t2 = _fjsp_add_v2r8(t2, szx);
320 t3 = _fjsp_add_v2r8(t3, syz);
321 t4 = _fjsp_add_v2r8(t4, sxy);
322 t5 = _fjsp_add_v2r8(t5, szx);
323 t6 = _fjsp_add_v2r8(t6, syz);
/* splat each shifted component into both lanes of its output register */
325 *x1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(0, 0));
326 *y1 = _fjsp_shuffle_v2r8(t1, t1, GMX_FJSP_SHUFFLE2(1, 1));
327 *z1 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(0, 0));
328 *x2 = _fjsp_shuffle_v2r8(t2, t2, GMX_FJSP_SHUFFLE2(1, 1));
329 *y2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(0, 0));
330 *z2 = _fjsp_shuffle_v2r8(t3, t3, GMX_FJSP_SHUFFLE2(1, 1));
331 *x3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(0, 0));
332 *y3 = _fjsp_shuffle_v2r8(t4, t4, GMX_FJSP_SHUFFLE2(1, 1));
333 *z3 = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(0, 0));
334 *x4 = _fjsp_shuffle_v2r8(t5, t5, GMX_FJSP_SHUFFLE2(1, 1));
335 *y4 = _fjsp_shuffle_v2r8(t6, t6, GMX_FJSP_SHUFFLE2(0, 0));
336 *z4 = _fjsp_shuffle_v2r8(t6, t6, GMX_FJSP_SHUFFLE2(1, 1));
/* Load one xyz triplet from a single pointer; each component goes into the
 * low lane of its output register with the high lane zeroed.
 */
341 static gmx_inline void
342 gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
343 _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
345 *x = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
346 *y = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
347 *z = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
/* Load three consecutive xyz triplets from a single pointer; components go
 * into low lanes, high lanes zeroed.
 */
350 static gmx_inline void
351 gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
352 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
353 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
354 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
356 *x1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
357 *y1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
358 *z1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
359 *x2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+3);
360 *y2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+4);
361 *z2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+5);
362 *x3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+6);
363 *y3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+7);
364 *z3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+8);
/* Load four consecutive xyz triplets from a single pointer; components go
 * into low lanes, high lanes zeroed.
 */
367 static gmx_inline void
368 gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
369 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
370 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
371 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
372 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
374 *x1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1);
375 *y1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+1);
376 *z1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+2);
377 *x2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+3);
378 *y2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+4);
379 *z2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+5);
380 *x3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+6);
381 *y3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+7);
382 *z3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+8);
383 *x4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+9);
384 *y4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+10);
385 *z4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), p1+11);
/* Load one xyz triplet from each of two pointers and transpose so that each
 * output register holds {A-component, B-component}.
 * NOTE(review): the lines assigning *x1 and *y1 (from t1/t2 after the
 * transpose) are not visible in this view of the file - confirm against the
 * original source.
 */
389 static gmx_inline void
390 gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA,
391 const double * gmx_restrict ptrB,
392 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
394 _fjsp_v2r8 t1, t2, t3, t4;
395 t1 = _fjsp_load_v2r8(ptrA);
396 t2 = _fjsp_load_v2r8(ptrB);
397 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
398 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
399 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
402 *z1 = _fjsp_unpacklo_v2r8(t3, t4);
/* Load three xyz triplets from each of two pointers and transpose pairwise.
 * NOTE(review): the lines assigning *x1..*y3 from the transposed t1..t8 are
 * not visible in this view of the file - confirm against the original.
 */
405 static gmx_inline void
406 gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
407 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
408 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
409 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
411 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
412 t1 = _fjsp_load_v2r8(ptrA);
413 t2 = _fjsp_load_v2r8(ptrB);
414 t3 = _fjsp_load_v2r8(ptrA+2);
415 t4 = _fjsp_load_v2r8(ptrB+2);
416 t5 = _fjsp_load_v2r8(ptrA+4);
417 t6 = _fjsp_load_v2r8(ptrB+4);
418 t7 = _fjsp_load_v2r8(ptrA+6);
419 t8 = _fjsp_load_v2r8(ptrB+6);
420 t9 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
421 t10 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+8);
422 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
423 GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
424 GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
425 GMX_FJSP_TRANSPOSE2_V2R8(t7, t8);
434 *z3 = _fjsp_unpacklo_v2r8(t9, t10);
/* Load four xyz triplets from each of two pointers, transposing pairwise in
 * two passes of three registers each.
 * NOTE(review): the lines assigning *x1..*z2 (first pass) and *x3..*z4
 * (second pass) from the transposed temporaries are not visible in this
 * view of the file - confirm against the original.
 */
438 static gmx_inline void
439 gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
440 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
441 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
442 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
443 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
445 _fjsp_v2r8 t1, t2, t3, t4, t5, t6;
446 t1 = _fjsp_load_v2r8(ptrA);
447 t2 = _fjsp_load_v2r8(ptrB);
448 t3 = _fjsp_load_v2r8(ptrA+2);
449 t4 = _fjsp_load_v2r8(ptrB+2);
450 t5 = _fjsp_load_v2r8(ptrA+4);
451 t6 = _fjsp_load_v2r8(ptrB+4);
452 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
453 GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
454 GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
461 t1 = _fjsp_load_v2r8(ptrA+6);
462 t2 = _fjsp_load_v2r8(ptrB+6);
463 t3 = _fjsp_load_v2r8(ptrA+8);
464 t4 = _fjsp_load_v2r8(ptrB+8);
465 t5 = _fjsp_load_v2r8(ptrA+10);
466 t6 = _fjsp_load_v2r8(ptrB+10);
467 GMX_FJSP_TRANSPOSE2_V2R8(t1, t2);
468 GMX_FJSP_TRANSPOSE2_V2R8(t3, t4);
469 GMX_FJSP_TRANSPOSE2_V2R8(t5, t6);
/* Subtract the low lanes of x1/y1/z1 from the xyz triplet at ptrA
 * (ptrA[i] -= v.low for each component).
 * NOTE(review): the return-type line of this function is not visible in this
 * view of the file.
 */
480 gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
481 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
483 _fjsp_v2r8 t1, t2, t3;
485 t1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
486 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+1);
487 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
489 t1 = _fjsp_sub_v2r8(t1, x1);
490 t2 = _fjsp_sub_v2r8(t2, y1);
491 t3 = _fjsp_sub_v2r8(t3, z1);
492 _fjsp_storel_v2r8(ptrA, t1);
493 _fjsp_storel_v2r8(ptrA+1, t2);
494 _fjsp_storel_v2r8(ptrA+2, t3);
/* Fused variant: ptrA[i] -= fscal * d{x,y,z}1 (low lanes), using the
 * negative-multiply-subtract intrinsic so no separate multiply is needed.
 * NOTE(review): the return-type line of this function is not visible in this
 * view of the file.
 */
498 gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 fscal,
499 _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
501 _fjsp_v2r8 t1, t2, t3;
503 t1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA);
504 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+1);
505 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
507 t1 = _fjsp_nmsub_v2r8(fscal, dx1, t1);
508 t2 = _fjsp_nmsub_v2r8(fscal, dy1, t2);
509 t3 = _fjsp_nmsub_v2r8(fscal, dz1, t3);
510 _fjsp_storel_v2r8(ptrA, t1);
511 _fjsp_storel_v2r8(ptrA+1, t2);
512 _fjsp_storel_v2r8(ptrA+2, t3);
/* Subtract three force triplets (low lanes) from 9 consecutive doubles at
 * ptrA. Inputs are first re-packed pairwise ({x1,y1},{z1,x2},...) so the
 * subtraction can run on 2-wide loads of memory.
 * NOTE(review): the return-type line of this function is not visible in this
 * view of the file.
 */
517 gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
518 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
519 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
520 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
522 _fjsp_v2r8 t1, t2, t3, t4, t5;
524 t1 = _fjsp_load_v2r8(ptrA);
525 t2 = _fjsp_load_v2r8(ptrA+2);
526 t3 = _fjsp_load_v2r8(ptrA+4);
527 t4 = _fjsp_load_v2r8(ptrA+6);
528 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
/* pair up the low lanes of consecutive components */
530 x1 = _fjsp_unpacklo_v2r8(x1, y1);
531 z1 = _fjsp_unpacklo_v2r8(z1, x2);
532 y2 = _fjsp_unpacklo_v2r8(y2, z2);
533 x3 = _fjsp_unpacklo_v2r8(x3, y3);
534 /* nothing to be done for z3 */
536 t1 = _fjsp_sub_v2r8(t1, x1);
537 t2 = _fjsp_sub_v2r8(t2, z1);
538 t3 = _fjsp_sub_v2r8(t3, y2);
539 t4 = _fjsp_sub_v2r8(t4, x3);
540 t5 = _fjsp_sub_v2r8(t5, z3);
541 _fjsp_storel_v2r8(ptrA, t1);
542 _fjsp_storeh_v2r8(ptrA+1, t1);
543 _fjsp_storel_v2r8(ptrA+2, t2);
544 _fjsp_storeh_v2r8(ptrA+3, t2);
545 _fjsp_storel_v2r8(ptrA+4, t3);
546 _fjsp_storeh_v2r8(ptrA+5, t3);
547 _fjsp_storel_v2r8(ptrA+6, t4);
548 _fjsp_storeh_v2r8(ptrA+7, t4);
549 _fjsp_storel_v2r8(ptrA+8, t5);
/* Same as above for four triplets (12 consecutive doubles at ptrA).
 * NOTE(review): the return-type line of this function is not visible in this
 * view of the file.
 */
554 gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
555 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
556 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
557 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
558 _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
560 _fjsp_v2r8 t1, t2, t3, t4, t5, t6;
562 t1 = _fjsp_load_v2r8(ptrA);
563 t2 = _fjsp_load_v2r8(ptrA+2);
564 t3 = _fjsp_load_v2r8(ptrA+4);
565 t4 = _fjsp_load_v2r8(ptrA+6);
566 t5 = _fjsp_load_v2r8(ptrA+8);
567 t6 = _fjsp_load_v2r8(ptrA+10);
/* pair up the low lanes of consecutive components */
569 x1 = _fjsp_unpacklo_v2r8(x1, y1);
570 z1 = _fjsp_unpacklo_v2r8(z1, x2);
571 y2 = _fjsp_unpacklo_v2r8(y2, z2);
572 x3 = _fjsp_unpacklo_v2r8(x3, y3);
573 z3 = _fjsp_unpacklo_v2r8(z3, x4);
574 y4 = _fjsp_unpacklo_v2r8(y4, z4);
/* note: each _fjsp_sub_v2r8 below is computed twice (once per store lane) */
576 _fjsp_storel_v2r8(ptrA, _fjsp_sub_v2r8( t1, x1 ));
577 _fjsp_storeh_v2r8(ptrA+1, _fjsp_sub_v2r8( t1, x1 ));
578 _fjsp_storel_v2r8(ptrA+2, _fjsp_sub_v2r8( t2, z1 ));
579 _fjsp_storeh_v2r8(ptrA+3, _fjsp_sub_v2r8( t2, z1 ));
580 _fjsp_storel_v2r8(ptrA+4, _fjsp_sub_v2r8( t3, y2 ));
581 _fjsp_storeh_v2r8(ptrA+5, _fjsp_sub_v2r8( t3, y2 ));
582 _fjsp_storel_v2r8(ptrA+6, _fjsp_sub_v2r8( t4, x3 ));
583 _fjsp_storeh_v2r8(ptrA+7, _fjsp_sub_v2r8( t4, x3 ));
584 _fjsp_storel_v2r8(ptrA+8, _fjsp_sub_v2r8( t5, z3 ));
585 _fjsp_storeh_v2r8(ptrA+9, _fjsp_sub_v2r8( t5, z3 ));
586 _fjsp_storel_v2r8(ptrA+10, _fjsp_sub_v2r8( t6, y4 ));
587 _fjsp_storeh_v2r8(ptrA+11, _fjsp_sub_v2r8( t6, y4 ));
/* Subtract one force triplet from each of two atoms: low lanes of x1/y1/z1
 * go to ptrA, high lanes to ptrB.
 * NOTE(review): the return-type line of this function is not visible in this
 * view of the file.
 */
591 gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
592 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
594 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7;
596 t1 = _fjsp_load_v2r8(ptrA);
597 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
598 t3 = _fjsp_load_v2r8(ptrB);
599 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
/* t5 = A's {x,y}, t6 = B's {x,y}, t7 = B's z broadcast */
601 t5 = _fjsp_unpacklo_v2r8(x1, y1);
602 t6 = _fjsp_unpackhi_v2r8(x1, y1);
603 t7 = _fjsp_unpackhi_v2r8(z1, z1);
605 t1 = _fjsp_sub_v2r8(t1, t5);
606 t2 = _fjsp_sub_v2r8(t2, z1);
608 t3 = _fjsp_sub_v2r8(t3, t6);
609 t4 = _fjsp_sub_v2r8(t4, t7);
611 _fjsp_storel_v2r8(ptrA, t1);
612 _fjsp_storeh_v2r8(ptrA+1, t1);
613 _fjsp_storel_v2r8(ptrA+2, t2);
614 _fjsp_storel_v2r8(ptrB, t3);
615 _fjsp_storeh_v2r8(ptrB+1, t3);
616 _fjsp_storel_v2r8(ptrB+2, t4);
/* Fused variant: subtracts fscal*d{x,y,z}1 per lane; fscal's low lane scales
 * the A update and its high lane the B update.
 * NOTE(review): the return-type line of this function is not visible in this
 * view of the file.
 */
621 gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
622 _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
624 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, fscalA, fscalB;
626 t1 = _fjsp_load_v2r8(ptrA);
627 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+2);
628 t3 = _fjsp_load_v2r8(ptrB);
629 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+2);
/* broadcast each lane of fscal to a full register */
630 fscalA = _fjsp_unpacklo_v2r8(fscal, fscal);
631 fscalB = _fjsp_unpackhi_v2r8(fscal, fscal);
633 t5 = _fjsp_unpacklo_v2r8(dx1, dy1);
634 t6 = _fjsp_unpackhi_v2r8(dx1, dy1);
635 t7 = _fjsp_unpackhi_v2r8(dz1, dz1);
637 t1 = _fjsp_nmsub_v2r8(fscalA, t5, t1);
638 t2 = _fjsp_nmsub_v2r8(fscalA, dz1, t2);
640 t3 = _fjsp_nmsub_v2r8(fscalB, t6, t3);
641 t4 = _fjsp_nmsub_v2r8(fscalB, t7, t4);
643 _fjsp_storel_v2r8(ptrA, t1);
644 _fjsp_storeh_v2r8(ptrA+1, t1);
645 _fjsp_storel_v2r8(ptrA+2, t2);
646 _fjsp_storel_v2r8(ptrB, t3);
647 _fjsp_storeh_v2r8(ptrB+1, t3);
648 _fjsp_storel_v2r8(ptrB+2, t4);
/* Subtract three force triplets from each of two atoms: tA..tI hold the
 * pairwise-unpacked low-lane (-> ptrA) and high-lane (-> ptrB) components.
 * NOTE(review): the return-type line of this function is not visible in this
 * view of the file.
 */
653 gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
654 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
655 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
656 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
658 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
659 _fjsp_v2r8 tA, tB, tC, tD, tE, tF, tG, tH, tI;
661 t1 = _fjsp_load_v2r8(ptrA);
662 t2 = _fjsp_load_v2r8(ptrA+2);
663 t3 = _fjsp_load_v2r8(ptrA+4);
664 t4 = _fjsp_load_v2r8(ptrA+6);
665 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA+8);
666 t6 = _fjsp_load_v2r8(ptrB);
667 t7 = _fjsp_load_v2r8(ptrB+2);
668 t8 = _fjsp_load_v2r8(ptrB+4);
669 t9 = _fjsp_load_v2r8(ptrB+6);
670 t10 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB+8);
/* lo-unpacks feed the ptrA update, hi-unpacks the ptrB update */
672 tA = _fjsp_unpacklo_v2r8(x1, y1);
673 tB = _fjsp_unpackhi_v2r8(x1, y1);
674 tC = _fjsp_unpacklo_v2r8(z1, x2);
675 tD = _fjsp_unpackhi_v2r8(z1, x2);
676 tE = _fjsp_unpacklo_v2r8(y2, z2);
677 tF = _fjsp_unpackhi_v2r8(y2, z2);
678 tG = _fjsp_unpacklo_v2r8(x3, y3);
679 tH = _fjsp_unpackhi_v2r8(x3, y3);
680 tI = _fjsp_unpackhi_v2r8(z3, z3);
682 t1 = _fjsp_sub_v2r8(t1, tA);
683 t2 = _fjsp_sub_v2r8(t2, tC);
684 t3 = _fjsp_sub_v2r8(t3, tE);
685 t4 = _fjsp_sub_v2r8(t4, tG);
686 t5 = _fjsp_sub_v2r8(t5, z3);
688 t6 = _fjsp_sub_v2r8(t6, tB);
689 t7 = _fjsp_sub_v2r8(t7, tD);
690 t8 = _fjsp_sub_v2r8(t8, tF);
691 t9 = _fjsp_sub_v2r8(t9, tH);
692 t10 = _fjsp_sub_v2r8(t10, tI);
694 _fjsp_storel_v2r8(ptrA, t1);
695 _fjsp_storeh_v2r8(ptrA+1, t1);
696 _fjsp_storel_v2r8(ptrA+2, t2);
697 _fjsp_storeh_v2r8(ptrA+3, t2);
698 _fjsp_storel_v2r8(ptrA+4, t3);
699 _fjsp_storeh_v2r8(ptrA+5, t3);
700 _fjsp_storel_v2r8(ptrA+6, t4);
701 _fjsp_storeh_v2r8(ptrA+7, t4);
702 _fjsp_storel_v2r8(ptrA+8, t5);
703 _fjsp_storel_v2r8(ptrB, t6);
704 _fjsp_storeh_v2r8(ptrB+1, t6);
705 _fjsp_storel_v2r8(ptrB+2, t7);
706 _fjsp_storeh_v2r8(ptrB+3, t7);
707 _fjsp_storel_v2r8(ptrB+4, t8);
708 _fjsp_storeh_v2r8(ptrB+5, t8);
709 _fjsp_storel_v2r8(ptrB+6, t9);
710 _fjsp_storeh_v2r8(ptrB+7, t9);
711 _fjsp_storel_v2r8(ptrB+8, t10);
/* Subtract four force triplets from each of two atoms: tA..tL hold the
 * pairwise-unpacked low-lane (-> ptrA) and high-lane (-> ptrB) components.
 * NOTE(review): the return-type line of this function is not visible in this
 * view of the file.
 */
716 gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
717 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
718 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
719 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
720 _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
722 _fjsp_v2r8 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12;
723 _fjsp_v2r8 tA, tB, tC, tD, tE, tF, tG, tH, tI, tJ, tK, tL;
725 t1 = _fjsp_load_v2r8(ptrA);
726 t2 = _fjsp_load_v2r8(ptrA+2);
727 t3 = _fjsp_load_v2r8(ptrA+4);
728 t4 = _fjsp_load_v2r8(ptrA+6);
729 t5 = _fjsp_load_v2r8(ptrA+8);
730 t6 = _fjsp_load_v2r8(ptrA+10);
731 t7 = _fjsp_load_v2r8(ptrB);
732 t8 = _fjsp_load_v2r8(ptrB+2);
733 t9 = _fjsp_load_v2r8(ptrB+4);
734 t10 = _fjsp_load_v2r8(ptrB+6);
735 t11 = _fjsp_load_v2r8(ptrB+8);
736 t12 = _fjsp_load_v2r8(ptrB+10);
/* lo-unpacks feed the ptrA update, hi-unpacks the ptrB update */
738 tA = _fjsp_unpacklo_v2r8(x1, y1);
739 tB = _fjsp_unpackhi_v2r8(x1, y1);
740 tC = _fjsp_unpacklo_v2r8(z1, x2);
741 tD = _fjsp_unpackhi_v2r8(z1, x2);
742 tE = _fjsp_unpacklo_v2r8(y2, z2);
743 tF = _fjsp_unpackhi_v2r8(y2, z2);
744 tG = _fjsp_unpacklo_v2r8(x3, y3);
745 tH = _fjsp_unpackhi_v2r8(x3, y3);
746 tI = _fjsp_unpacklo_v2r8(z3, x4);
747 tJ = _fjsp_unpackhi_v2r8(z3, x4);
748 tK = _fjsp_unpacklo_v2r8(y4, z4);
749 tL = _fjsp_unpackhi_v2r8(y4, z4);
751 t1 = _fjsp_sub_v2r8(t1, tA);
752 t2 = _fjsp_sub_v2r8(t2, tC);
753 t3 = _fjsp_sub_v2r8(t3, tE);
754 t4 = _fjsp_sub_v2r8(t4, tG);
755 t5 = _fjsp_sub_v2r8(t5, tI);
756 t6 = _fjsp_sub_v2r8(t6, tK);
758 t7 = _fjsp_sub_v2r8(t7, tB);
759 t8 = _fjsp_sub_v2r8(t8, tD);
760 t9 = _fjsp_sub_v2r8(t9, tF);
761 t10 = _fjsp_sub_v2r8(t10, tH);
762 t11 = _fjsp_sub_v2r8(t11, tJ);
763 t12 = _fjsp_sub_v2r8(t12, tL);
765 _fjsp_storel_v2r8(ptrA, t1);
766 _fjsp_storeh_v2r8(ptrA+1, t1);
767 _fjsp_storel_v2r8(ptrA+2, t2);
768 _fjsp_storeh_v2r8(ptrA+3, t2);
769 _fjsp_storel_v2r8(ptrA+4, t3);
770 _fjsp_storeh_v2r8(ptrA+5, t3);
771 _fjsp_storel_v2r8(ptrA+6, t4);
772 _fjsp_storeh_v2r8(ptrA+7, t4);
773 _fjsp_storel_v2r8(ptrA+8, t5);
774 _fjsp_storeh_v2r8(ptrA+9, t5);
775 _fjsp_storel_v2r8(ptrA+10, t6);
776 _fjsp_storeh_v2r8(ptrA+11, t6);
777 _fjsp_storel_v2r8(ptrB, t7);
778 _fjsp_storeh_v2r8(ptrB+1, t7);
779 _fjsp_storel_v2r8(ptrB+2, t8);
780 _fjsp_storeh_v2r8(ptrB+3, t8);
781 _fjsp_storel_v2r8(ptrB+4, t9);
782 _fjsp_storeh_v2r8(ptrB+5, t9);
783 _fjsp_storel_v2r8(ptrB+6, t10);
784 _fjsp_storeh_v2r8(ptrB+7, t10);
785 _fjsp_storel_v2r8(ptrB+8, t11);
786 _fjsp_storeh_v2r8(ptrB+9, t11);
787 _fjsp_storel_v2r8(ptrB+10, t12);
788 _fjsp_storeh_v2r8(ptrB+11, t12);
/* Reduce the per-lane partial i-forces for one atom across the two SIMD
 * lanes and accumulate the result into both the force array (fptr) and the
 * shift-force array (fshiftptr).
 * NOTE(review): a 't1 = fix1;' assignment before the unpack pair appears to
 * be missing from this view - as shown, t1 is read uninitialized at the
 * unpackhi. Confirm against the original source.
 */
793 static gmx_inline void
794 gmx_fjsp_update_iforce_1atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
795 double * gmx_restrict fptr,
796 double * gmx_restrict fshiftptr)
798 __m128d t1, t2, t3, t4;
802 fix1 = _fjsp_unpacklo_v2r8(fix1, fiy1); /* y0 x0 */
803 fiy1 = _fjsp_unpackhi_v2r8(t1, fiy1); /* y1 x1 */
805 fix1 = _fjsp_add_v2r8(fix1, fiy1);
806 fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1, fiz1 ));
808 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
809 _fjsp_storel_v2r8( fptr, t4 );
810 _fjsp_storeh_v2r8( fptr+1, t4 );
811 _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fptr+2), fiz1 ));
813 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
814 _fjsp_storel_v2r8( fshiftptr, t4 );
815 _fjsp_storeh_v2r8( fshiftptr+1, t4 );
816 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
/* Reduce the per-lane partial i-forces for three atoms across the two SIMD
 * lanes, accumulate them into the force array (fptr, 9 doubles), and add the
 * total over all three atoms into the shift-force array (fshiftptr).
 * NOTE(review): a 't1 = fix3;' assignment before the fix3/fiy3 unpack pair
 * appears to be missing from this view - as shown, t1 is read uninitialized
 * at the unpackhi. Confirm against the original source.
 */
819 static gmx_inline void
820 gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
821 _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
822 _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
823 double * gmx_restrict fptr,
824 double * gmx_restrict fshiftptr)
826 __m128d t1, t2, t3, t4, t5, t6;
/* transpose component pairs so lane-wise adds produce per-atom sums */
829 GMX_FJSP_TRANSPOSE2_V2R8(fix1, fiy1);
830 GMX_FJSP_TRANSPOSE2_V2R8(fiz1, fix2);
831 GMX_FJSP_TRANSPOSE2_V2R8(fiy2, fiz2);
833 fix3 = _fjsp_unpacklo_v2r8(fix3, fiy3); /* y0 x0 */
834 fiy3 = _fjsp_unpackhi_v2r8(t1, fiy3); /* y1 x1 */
836 fix1 = _fjsp_add_v2r8(fix1, fiy1);
837 fiz1 = _fjsp_add_v2r8(fiz1, fix2);
838 fiy2 = _fjsp_add_v2r8(fiy2, fiz2);
840 fix3 = _fjsp_add_v2r8(fix3, fiy3);
841 fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3, fiz3));
843 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
844 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
845 t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
846 t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
848 _fjsp_storel_v2r8( fptr, t3 );
849 _fjsp_storeh_v2r8( fptr+1, t3 );
850 _fjsp_storel_v2r8( fptr+2, t4 );
851 _fjsp_storeh_v2r8( fptr+3, t4 );
852 _fjsp_storel_v2r8( fptr+4, t5 );
853 _fjsp_storeh_v2r8( fptr+5, t5 );
854 _fjsp_storel_v2r8( fptr+6, t6 );
855 _fjsp_storeh_v2r8( fptr+7, t6 );
856 _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fptr+8), fiz3 ));
/* accumulate the shift force: sum x/y and z contributions over all atoms */
858 fix1 = _fjsp_add_v2r8(fix1, fix3);
859 t1 = _fjsp_shuffle_v2r8(fiz1, fiy2, GMX_FJSP_SHUFFLE2(0, 1));
860 fix1 = _fjsp_add_v2r8(fix1, t1); /* x and y sums */
862 t2 = _fjsp_shuffle_v2r8(fiy2, fiy2, GMX_FJSP_SHUFFLE2(1, 1));
863 fiz1 = _fjsp_add_v2r8(fiz1, fiz3);
864 fiz1 = _fjsp_add_v2r8(fiz1, t2); /* z sum */
866 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
867 _fjsp_storel_v2r8( fshiftptr, t3 );
868 _fjsp_storeh_v2r8( fshiftptr+1, t3 );
869 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
/* Reduce the per-lane partial i-forces for four atoms across the two SIMD
 * lanes, accumulate them into the force array (fptr, 12 doubles), and add the
 * total over all four atoms into the shift-force array (fshiftptr).
 */
873 static gmx_inline void
874 gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
875 _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
876 _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
877 _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
878 double * gmx_restrict fptr,
879 double * gmx_restrict fshiftptr)
881 __m128d t1, t2, t3, t4, t5, t6, t7, t8;
/* transpose component pairs so lane-wise adds produce per-atom sums */
884 GMX_FJSP_TRANSPOSE2_V2R8(fix1, fiy1);
885 GMX_FJSP_TRANSPOSE2_V2R8(fiz1, fix2);
886 GMX_FJSP_TRANSPOSE2_V2R8(fiy2, fiz2);
887 GMX_FJSP_TRANSPOSE2_V2R8(fix3, fiy3);
888 GMX_FJSP_TRANSPOSE2_V2R8(fiz3, fix4);
889 GMX_FJSP_TRANSPOSE2_V2R8(fiy4, fiz4);
891 fix1 = _fjsp_add_v2r8(fix1, fiy1);
892 fiz1 = _fjsp_add_v2r8(fiz1, fix2);
893 fiy2 = _fjsp_add_v2r8(fiy2, fiz2);
894 fix3 = _fjsp_add_v2r8(fix3, fiy3);
895 fiz3 = _fjsp_add_v2r8(fiz3, fix4);
896 fiy4 = _fjsp_add_v2r8(fiy4, fiz4);
898 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
899 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
900 t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
901 t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
902 t7 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+8), fiz3 );
903 t8 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+10), fiy4 );
904 _fjsp_storel_v2r8( fptr, t3 );
905 _fjsp_storeh_v2r8( fptr+1, t3 );
906 _fjsp_storel_v2r8( fptr+2, t4 );
907 _fjsp_storeh_v2r8( fptr+3, t4 );
908 _fjsp_storel_v2r8( fptr+4, t5 );
909 _fjsp_storeh_v2r8( fptr+5, t5 );
910 _fjsp_storel_v2r8( fptr+6, t6 );
911 _fjsp_storeh_v2r8( fptr+7, t6 );
912 _fjsp_storel_v2r8( fptr+8, t7 );
913 _fjsp_storeh_v2r8( fptr+9, t7 );
914 _fjsp_storel_v2r8( fptr+10, t8 );
915 _fjsp_storeh_v2r8( fptr+11, t8 );
/* accumulate the shift force: sum x/y and z contributions over all atoms */
917 t1 = _fjsp_shuffle_v2r8(fiz1, fiy2, GMX_FJSP_SHUFFLE2(0, 1));
918 fix1 = _fjsp_add_v2r8(fix1, t1);
919 t2 = _fjsp_shuffle_v2r8(fiz3, fiy4, GMX_FJSP_SHUFFLE2(0, 1));
920 fix3 = _fjsp_add_v2r8(fix3, t2);
921 fix1 = _fjsp_add_v2r8(fix1, fix3); /* x and y sums */
923 fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2, fiy2));
924 fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4, fiy4));
925 fiz1 = _fjsp_add_v2r8(fiz1, fiz3); /* z sum */
927 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
928 _fjsp_storel_v2r8( fshiftptr, t3 );
929 _fjsp_storeh_v2r8( fshiftptr+1, t3 );
930 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), fshiftptr+2), fiz1 ));
/* Sum the two lanes of a potential accumulator and add the result to *ptrA. */
935 static gmx_inline void
936 gmx_fjsp_update_1pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA)
938 pot1 = _fjsp_add_v2r8(pot1, _fjsp_unpackhi_v2r8(pot1, pot1));
939 _fjsp_storel_v2r8(ptrA, _fjsp_add_v2r8(pot1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA)));
/* Reduce two potential accumulators at once: after the transpose-and-add,
 * the low lane is the pot1 total (-> *ptrA) and the high lane the pot2 total
 * (-> *ptrB).
 */
942 static gmx_inline void
943 gmx_fjsp_update_2pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA,
944 _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
946 GMX_FJSP_TRANSPOSE2_V2R8(pot1, pot2);
947 pot1 = _fjsp_add_v2r8(pot1, pot2);
948 pot2 = _fjsp_unpackhi_v2r8(pot1, pot1);
950 _fjsp_storel_v2r8(ptrA, _fjsp_add_v2r8(pot1, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrA)));
951 _fjsp_storel_v2r8(ptrB, _fjsp_add_v2r8(pot2, _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(), ptrB)));
955 #endif /* _kernelutil_sparc64_hpc_ace_double_h_ */