/*
 * This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2011-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
29 #ifndef _kernelutil_sparc64_hpc_ace_double_h_
30 #define _kernelutil_sparc64_hpc_ace_double_h_
32 /* Fujitsu header borrows the name from SSE2, since some instructions have aliases */
33 #include "emmintrin.h"
/* Build the two-bit immediate selector handed to _fjsp_shuffle_v2r8():
 * x is placed in bit 1 and y in bit 0. Callers in this file only pass 0 or 1.
 */
#define GMX_FJSP_SHUFFLE2(x,y) (((x)<<1) | (y))
/* In-place 2x2 transpose of two registers.
 *
 * Afterwards row0 = unpacklo(row0,row1) and row1 = unpackhi(row0,row1), both
 * computed from the values the arguments had on entry (row0 is saved in a
 * temporary first). Both arguments must be modifiable lvalues of type
 * _fjsp_v2r8 and are evaluated more than once.
 */
#define GMX_FJSP_TRANSPOSE2_V2R8(row0, row1) {           \
    _fjsp_v2r8 __gmx_t1 = row0;                          \
    row0 = _fjsp_unpacklo_v2r8(row0,row1);               \
    row1 = _fjsp_unpackhi_v2r8(__gmx_t1,row1);           \
}
45 gmx_fjsp_print_v2r8(const char *s, _fjsp_v2r8 a)
49 _fjsp_storel_v2r8(&lo,a);
50 _fjsp_storeh_v2r8(&hi,a);
51 printf("%s: %g %g\n",s,lo,hi);
56 gmx_fjsp_set1_v2r8(double d)
58 return _fjsp_set_v2r8(d,d);
62 gmx_fjsp_load1_v2r8(const double * gmx_restrict ptr)
64 return gmx_fjsp_set1_v2r8(*ptr);
69 gmx_fjsp_any_lt_v2r8(_fjsp_v2r8 a, _fjsp_v2r8 b)
78 a = _fjsp_cmplt_v2r8(a,b);
79 a = _fjsp_or_v2r8(a, _fjsp_unpackhi_v2r8(a,a));
80 _fjsp_storel_v2r8(&(conv.d),a);
85 static gmx_inline _fjsp_v2r8
86 gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
88 const _fjsp_v2r8 half = gmx_fjsp_set1_v2r8(0.5);
89 const _fjsp_v2r8 three = gmx_fjsp_set1_v2r8(3.0);
90 _fjsp_v2r8 lu = _fjsp_rsqrta_v2r8(x);
92 lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
93 /* The HPC-ACE instruction set is only available in double precision, while
94 * single precision is typically sufficient for Gromacs. If you define
95 * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
96 * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
97 * double precision (53 bits). This is still clearly higher than single precision (24 bits).
99 #ifndef GMX_RELAXED_DOUBLE_PRECISION
100 lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
102 return _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
107 static gmx_inline _fjsp_v2r8
108 gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
110 const _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
111 __m128d lu = _fjsp_rcpa_v2r8(x);
113 /* Perform three N-R steps for double precision */
114 lu = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
115 /* The HPC-ACE instruction set is only available in double precision, while
116 * single precision is typically sufficient for Gromacs. If you define
117 * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
118 * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
119 * double precision (53 bits). This is still clearly higher than single precision (24 bits).
121 #ifndef GMX_RELAXED_DOUBLE_PRECISION
122 lu = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
124 return _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
128 static gmx_inline _fjsp_v2r8
129 gmx_fjsp_calc_rsq_v2r8(_fjsp_v2r8 dx, _fjsp_v2r8 dy, _fjsp_v2r8 dz)
131 return _fjsp_madd_v2r8(dx,dx,_fjsp_madd_v2r8(dy,dy,_fjsp_mul_v2r8(dz,dz)));
/* Normal sum of four SIMD registers, added pairwise as (t0+t1)+(t2+t3) */
#define gmx_fjsp_sum4_v2r8(t0,t1,t2,t3) _fjsp_add_v2r8(_fjsp_add_v2r8(t0,t1),_fjsp_add_v2r8(t2,t3))
142 gmx_fjsp_load_2real_swizzle_v2r8(const double * gmx_restrict ptrA,
143 const double * gmx_restrict ptrB)
145 return _fjsp_unpacklo_v2r8(_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA),_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB));
149 gmx_fjsp_load_1real_v2r8(const double * gmx_restrict ptrA)
151 return _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
156 gmx_fjsp_store_2real_swizzle_v2r8(double * gmx_restrict ptrA,
157 double * gmx_restrict ptrB,
162 t2 = _fjsp_unpackhi_v2r8(xmm1,xmm1);
163 _fjsp_storel_v2r8(ptrA,xmm1);
164 _fjsp_storel_v2r8(ptrB,t2);
168 gmx_fjsp_store_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
170 _fjsp_storel_v2r8(ptrA,xmm1);
174 /* Similar to store, but increments value in memory */
176 gmx_fjsp_increment_2real_swizzle_v2r8(double * gmx_restrict ptrA,
177 double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
181 t1 = _fjsp_unpackhi_v2r8(xmm1,xmm1);
182 xmm1 = _fjsp_add_v2r8(xmm1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA));
183 t1 = _fjsp_add_v2r8(t1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB));
184 _fjsp_storel_v2r8(ptrA,xmm1);
185 _fjsp_storel_v2r8(ptrB,t1);
189 gmx_fjsp_increment_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
193 tmp = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
194 tmp = _fjsp_add_v2r8(tmp,xmm1);
195 _fjsp_storel_v2r8(ptrA,tmp);
200 static gmx_inline void
201 gmx_fjsp_load_2pair_swizzle_v2r8(const double * gmx_restrict p1,
202 const double * gmx_restrict p2,
203 _fjsp_v2r8 * gmx_restrict c6,
204 _fjsp_v2r8 * gmx_restrict c12)
208 /* The c6/c12 array should be aligned */
209 t1 = _fjsp_load_v2r8(p1);
210 t2 = _fjsp_load_v2r8(p2);
211 *c6 = _fjsp_unpacklo_v2r8(t1,t2);
212 *c12 = _fjsp_unpackhi_v2r8(t1,t2);
215 static gmx_inline void
216 gmx_fjsp_load_1pair_swizzle_v2r8(const double * gmx_restrict p1,
217 _fjsp_v2r8 * gmx_restrict c6,
218 _fjsp_v2r8 * gmx_restrict c12)
220 *c6 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
221 *c12 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
225 static gmx_inline void
226 gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
227 const double * gmx_restrict xyz,
228 _fjsp_v2r8 * gmx_restrict x1,
229 _fjsp_v2r8 * gmx_restrict y1,
230 _fjsp_v2r8 * gmx_restrict z1)
232 _fjsp_v2r8 mem_xy,mem_z,mem_sxy,mem_sz;
234 mem_xy = _fjsp_load_v2r8(xyz);
235 mem_z = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+2);
236 mem_sxy = _fjsp_load_v2r8(xyz_shift);
237 mem_sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
239 mem_xy = _fjsp_add_v2r8(mem_xy,mem_sxy);
240 mem_z = _fjsp_add_v2r8(mem_z,mem_sz);
242 *x1 = _fjsp_shuffle_v2r8(mem_xy,mem_xy,GMX_FJSP_SHUFFLE2(0,0));
243 *y1 = _fjsp_shuffle_v2r8(mem_xy,mem_xy,GMX_FJSP_SHUFFLE2(1,1));
244 *z1 = _fjsp_shuffle_v2r8(mem_z,mem_z,GMX_FJSP_SHUFFLE2(0,0));
248 static gmx_inline void
249 gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
250 const double * gmx_restrict xyz,
251 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
252 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
253 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
255 _fjsp_v2r8 t1,t2,t3,t4,t5,sxy,sz,szx,syz;
257 t1 = _fjsp_load_v2r8(xyz);
258 t2 = _fjsp_load_v2r8(xyz+2);
259 t3 = _fjsp_load_v2r8(xyz+4);
260 t4 = _fjsp_load_v2r8(xyz+6);
261 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+8);
263 sxy = _fjsp_load_v2r8(xyz_shift);
264 sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
265 szx = _fjsp_shuffle_v2r8(sz,sxy,GMX_FJSP_SHUFFLE2(0,0));
266 syz = _fjsp_shuffle_v2r8(sxy,sz,GMX_FJSP_SHUFFLE2(0,1));
268 t1 = _fjsp_add_v2r8(t1,sxy);
269 t2 = _fjsp_add_v2r8(t2,szx);
270 t3 = _fjsp_add_v2r8(t3,syz);
271 t4 = _fjsp_add_v2r8(t4,sxy);
272 t5 = _fjsp_add_v2r8(t5,sz);
274 *x1 = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(0,0));
275 *y1 = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(1,1));
276 *z1 = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(0,0));
277 *x2 = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(1,1));
278 *y2 = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(0,0));
279 *z2 = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(1,1));
280 *x3 = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(0,0));
281 *y3 = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(1,1));
282 *z3 = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(0,0));
286 static gmx_inline void
287 gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
288 const double * gmx_restrict xyz,
289 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
290 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
291 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
292 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
294 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
296 t1 = _fjsp_load_v2r8(xyz);
297 t2 = _fjsp_load_v2r8(xyz+2);
298 t3 = _fjsp_load_v2r8(xyz+4);
299 t4 = _fjsp_load_v2r8(xyz+6);
300 t5 = _fjsp_load_v2r8(xyz+8);
301 t6 = _fjsp_load_v2r8(xyz+10);
303 sxy = _fjsp_load_v2r8(xyz_shift);
304 sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
305 szx = _fjsp_shuffle_v2r8(sz,sxy,GMX_FJSP_SHUFFLE2(0,0));
306 syz = _fjsp_shuffle_v2r8(sxy,sz,GMX_FJSP_SHUFFLE2(0,1));
308 t1 = _fjsp_add_v2r8(t1,sxy);
309 t2 = _fjsp_add_v2r8(t2,szx);
310 t3 = _fjsp_add_v2r8(t3,syz);
311 t4 = _fjsp_add_v2r8(t4,sxy);
312 t5 = _fjsp_add_v2r8(t5,szx);
313 t6 = _fjsp_add_v2r8(t6,syz);
315 *x1 = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(0,0));
316 *y1 = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(1,1));
317 *z1 = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(0,0));
318 *x2 = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(1,1));
319 *y2 = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(0,0));
320 *z2 = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(1,1));
321 *x3 = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(0,0));
322 *y3 = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(1,1));
323 *z3 = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(0,0));
324 *x4 = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(1,1));
325 *y4 = _fjsp_shuffle_v2r8(t6,t6,GMX_FJSP_SHUFFLE2(0,0));
326 *z4 = _fjsp_shuffle_v2r8(t6,t6,GMX_FJSP_SHUFFLE2(1,1));
331 static gmx_inline void
332 gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
333 _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
335 *x = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
336 *y = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
337 *z = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
340 static gmx_inline void
341 gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
342 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
343 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
344 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
346 *x1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
347 *y1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
348 *z1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
349 *x2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+3);
350 *y2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+4);
351 *z2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+5);
352 *x3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+6);
353 *y3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+7);
354 *z3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+8);
357 static gmx_inline void
358 gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
359 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
360 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
361 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
362 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
364 *x1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
365 *y1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
366 *z1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
367 *x2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+3);
368 *y2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+4);
369 *z2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+5);
370 *x3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+6);
371 *y3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+7);
372 *z3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+8);
373 *x4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+9);
374 *y4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+10);
375 *z4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+11);
379 static gmx_inline void
380 gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA,
381 const double * gmx_restrict ptrB,
382 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
384 _fjsp_v2r8 t1,t2,t3,t4;
385 t1 = _fjsp_load_v2r8(ptrA);
386 t2 = _fjsp_load_v2r8(ptrB);
387 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
388 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
389 GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
392 *z1 = _fjsp_unpacklo_v2r8(t3,t4);
395 static gmx_inline void
396 gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
397 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
398 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
399 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
401 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
402 t1 = _fjsp_load_v2r8(ptrA);
403 t2 = _fjsp_load_v2r8(ptrB);
404 t3 = _fjsp_load_v2r8(ptrA+2);
405 t4 = _fjsp_load_v2r8(ptrB+2);
406 t5 = _fjsp_load_v2r8(ptrA+4);
407 t6 = _fjsp_load_v2r8(ptrB+4);
408 t7 = _fjsp_load_v2r8(ptrA+6);
409 t8 = _fjsp_load_v2r8(ptrB+6);
410 t9 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
411 t10 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
412 GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
413 GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
414 GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
415 GMX_FJSP_TRANSPOSE2_V2R8(t7,t8);
424 *z3 = _fjsp_unpacklo_v2r8(t9,t10);
428 static gmx_inline void
429 gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
430 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
431 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
432 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
433 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
435 _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
436 t1 = _fjsp_load_v2r8(ptrA);
437 t2 = _fjsp_load_v2r8(ptrB);
438 t3 = _fjsp_load_v2r8(ptrA+2);
439 t4 = _fjsp_load_v2r8(ptrB+2);
440 t5 = _fjsp_load_v2r8(ptrA+4);
441 t6 = _fjsp_load_v2r8(ptrB+4);
442 GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
443 GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
444 GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
451 t1 = _fjsp_load_v2r8(ptrA+6);
452 t2 = _fjsp_load_v2r8(ptrB+6);
453 t3 = _fjsp_load_v2r8(ptrA+8);
454 t4 = _fjsp_load_v2r8(ptrB+8);
455 t5 = _fjsp_load_v2r8(ptrA+10);
456 t6 = _fjsp_load_v2r8(ptrB+10);
457 GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
458 GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
459 GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
470 gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
471 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
475 t1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
476 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
477 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
479 t1 = _fjsp_sub_v2r8(t1,x1);
480 t2 = _fjsp_sub_v2r8(t2,y1);
481 t3 = _fjsp_sub_v2r8(t3,z1);
482 _fjsp_storel_v2r8(ptrA,t1);
483 _fjsp_storel_v2r8(ptrA+1,t2);
484 _fjsp_storel_v2r8(ptrA+2,t3);
488 gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 fscal,
489 _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
493 t1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
494 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
495 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
497 t1 = _fjsp_nmsub_v2r8(fscal,dx1,t1);
498 t2 = _fjsp_nmsub_v2r8(fscal,dy1,t2);
499 t3 = _fjsp_nmsub_v2r8(fscal,dz1,t3);
500 _fjsp_storel_v2r8(ptrA,t1);
501 _fjsp_storel_v2r8(ptrA+1,t2);
502 _fjsp_storel_v2r8(ptrA+2,t3);
507 gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
508 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
509 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
510 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
512 _fjsp_v2r8 t1,t2,t3,t4,t5;
514 t1 = _fjsp_load_v2r8(ptrA);
515 t2 = _fjsp_load_v2r8(ptrA+2);
516 t3 = _fjsp_load_v2r8(ptrA+4);
517 t4 = _fjsp_load_v2r8(ptrA+6);
518 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
520 x1 = _fjsp_unpacklo_v2r8(x1,y1);
521 z1 = _fjsp_unpacklo_v2r8(z1,x2);
522 y2 = _fjsp_unpacklo_v2r8(y2,z2);
523 x3 = _fjsp_unpacklo_v2r8(x3,y3);
524 /* nothing to be done for z3 */
526 t1 = _fjsp_sub_v2r8(t1,x1);
527 t2 = _fjsp_sub_v2r8(t2,z1);
528 t3 = _fjsp_sub_v2r8(t3,y2);
529 t4 = _fjsp_sub_v2r8(t4,x3);
530 t5 = _fjsp_sub_v2r8(t5,z3);
531 _fjsp_storel_v2r8(ptrA,t1);
532 _fjsp_storeh_v2r8(ptrA+1,t1);
533 _fjsp_storel_v2r8(ptrA+2,t2);
534 _fjsp_storeh_v2r8(ptrA+3,t2);
535 _fjsp_storel_v2r8(ptrA+4,t3);
536 _fjsp_storeh_v2r8(ptrA+5,t3);
537 _fjsp_storel_v2r8(ptrA+6,t4);
538 _fjsp_storeh_v2r8(ptrA+7,t4);
539 _fjsp_storel_v2r8(ptrA+8,t5);
544 gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
545 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
546 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
547 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
548 _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
550 _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
552 t1 = _fjsp_load_v2r8(ptrA);
553 t2 = _fjsp_load_v2r8(ptrA+2);
554 t3 = _fjsp_load_v2r8(ptrA+4);
555 t4 = _fjsp_load_v2r8(ptrA+6);
556 t5 = _fjsp_load_v2r8(ptrA+8);
557 t6 = _fjsp_load_v2r8(ptrA+10);
559 x1 = _fjsp_unpacklo_v2r8(x1,y1);
560 z1 = _fjsp_unpacklo_v2r8(z1,x2);
561 y2 = _fjsp_unpacklo_v2r8(y2,z2);
562 x3 = _fjsp_unpacklo_v2r8(x3,y3);
563 z3 = _fjsp_unpacklo_v2r8(z3,x4);
564 y4 = _fjsp_unpacklo_v2r8(y4,z4);
566 _fjsp_storel_v2r8(ptrA, _fjsp_sub_v2r8( t1,x1 ));
567 _fjsp_storeh_v2r8(ptrA+1, _fjsp_sub_v2r8( t1,x1 ));
568 _fjsp_storel_v2r8(ptrA+2, _fjsp_sub_v2r8( t2,z1 ));
569 _fjsp_storeh_v2r8(ptrA+3, _fjsp_sub_v2r8( t2,z1 ));
570 _fjsp_storel_v2r8(ptrA+4, _fjsp_sub_v2r8( t3,y2 ));
571 _fjsp_storeh_v2r8(ptrA+5, _fjsp_sub_v2r8( t3,y2 ));
572 _fjsp_storel_v2r8(ptrA+6, _fjsp_sub_v2r8( t4,x3 ));
573 _fjsp_storeh_v2r8(ptrA+7, _fjsp_sub_v2r8( t4,x3 ));
574 _fjsp_storel_v2r8(ptrA+8, _fjsp_sub_v2r8( t5,z3 ));
575 _fjsp_storeh_v2r8(ptrA+9, _fjsp_sub_v2r8( t5,z3 ));
576 _fjsp_storel_v2r8(ptrA+10, _fjsp_sub_v2r8( t6,y4 ));
577 _fjsp_storeh_v2r8(ptrA+11, _fjsp_sub_v2r8( t6,y4 ));
581 gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
582 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
584 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7;
586 t1 = _fjsp_load_v2r8(ptrA);
587 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
588 t3 = _fjsp_load_v2r8(ptrB);
589 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
591 t5 = _fjsp_unpacklo_v2r8(x1,y1);
592 t6 = _fjsp_unpackhi_v2r8(x1,y1);
593 t7 = _fjsp_unpackhi_v2r8(z1,z1);
595 t1 = _fjsp_sub_v2r8(t1,t5);
596 t2 = _fjsp_sub_v2r8(t2,z1);
598 t3 = _fjsp_sub_v2r8(t3,t6);
599 t4 = _fjsp_sub_v2r8(t4,t7);
601 _fjsp_storel_v2r8(ptrA,t1);
602 _fjsp_storeh_v2r8(ptrA+1,t1);
603 _fjsp_storel_v2r8(ptrA+2,t2);
604 _fjsp_storel_v2r8(ptrB,t3);
605 _fjsp_storeh_v2r8(ptrB+1,t3);
606 _fjsp_storel_v2r8(ptrB+2,t4);
611 gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
612 _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
614 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,fscalA,fscalB;
616 t1 = _fjsp_load_v2r8(ptrA);
617 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
618 t3 = _fjsp_load_v2r8(ptrB);
619 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
620 fscalA = _fjsp_unpacklo_v2r8(fscal,fscal);
621 fscalB = _fjsp_unpackhi_v2r8(fscal,fscal);
623 t5 = _fjsp_unpacklo_v2r8(dx1,dy1);
624 t6 = _fjsp_unpackhi_v2r8(dx1,dy1);
625 t7 = _fjsp_unpackhi_v2r8(dz1,dz1);
627 t1 = _fjsp_nmsub_v2r8(fscalA,t5,t1);
628 t2 = _fjsp_nmsub_v2r8(fscalA,dz1,t2);
630 t3 = _fjsp_nmsub_v2r8(fscalB,t6,t3);
631 t4 = _fjsp_nmsub_v2r8(fscalB,t7,t4);
633 _fjsp_storel_v2r8(ptrA,t1);
634 _fjsp_storeh_v2r8(ptrA+1,t1);
635 _fjsp_storel_v2r8(ptrA+2,t2);
636 _fjsp_storel_v2r8(ptrB,t3);
637 _fjsp_storeh_v2r8(ptrB+1,t3);
638 _fjsp_storel_v2r8(ptrB+2,t4);
643 gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
644 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
645 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
646 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
648 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
649 _fjsp_v2r8 tA,tB,tC,tD,tE,tF,tG,tH,tI;
651 t1 = _fjsp_load_v2r8(ptrA);
652 t2 = _fjsp_load_v2r8(ptrA+2);
653 t3 = _fjsp_load_v2r8(ptrA+4);
654 t4 = _fjsp_load_v2r8(ptrA+6);
655 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
656 t6 = _fjsp_load_v2r8(ptrB);
657 t7 = _fjsp_load_v2r8(ptrB+2);
658 t8 = _fjsp_load_v2r8(ptrB+4);
659 t9 = _fjsp_load_v2r8(ptrB+6);
660 t10 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
662 tA = _fjsp_unpacklo_v2r8(x1,y1);
663 tB = _fjsp_unpackhi_v2r8(x1,y1);
664 tC = _fjsp_unpacklo_v2r8(z1,x2);
665 tD = _fjsp_unpackhi_v2r8(z1,x2);
666 tE = _fjsp_unpacklo_v2r8(y2,z2);
667 tF = _fjsp_unpackhi_v2r8(y2,z2);
668 tG = _fjsp_unpacklo_v2r8(x3,y3);
669 tH = _fjsp_unpackhi_v2r8(x3,y3);
670 tI = _fjsp_unpackhi_v2r8(z3,z3);
672 t1 = _fjsp_sub_v2r8(t1,tA);
673 t2 = _fjsp_sub_v2r8(t2,tC);
674 t3 = _fjsp_sub_v2r8(t3,tE);
675 t4 = _fjsp_sub_v2r8(t4,tG);
676 t5 = _fjsp_sub_v2r8(t5,z3);
678 t6 = _fjsp_sub_v2r8(t6,tB);
679 t7 = _fjsp_sub_v2r8(t7,tD);
680 t8 = _fjsp_sub_v2r8(t8,tF);
681 t9 = _fjsp_sub_v2r8(t9,tH);
682 t10 = _fjsp_sub_v2r8(t10,tI);
684 _fjsp_storel_v2r8(ptrA,t1);
685 _fjsp_storeh_v2r8(ptrA+1,t1);
686 _fjsp_storel_v2r8(ptrA+2,t2);
687 _fjsp_storeh_v2r8(ptrA+3,t2);
688 _fjsp_storel_v2r8(ptrA+4,t3);
689 _fjsp_storeh_v2r8(ptrA+5,t3);
690 _fjsp_storel_v2r8(ptrA+6,t4);
691 _fjsp_storeh_v2r8(ptrA+7,t4);
692 _fjsp_storel_v2r8(ptrA+8,t5);
693 _fjsp_storel_v2r8(ptrB,t6);
694 _fjsp_storeh_v2r8(ptrB+1,t6);
695 _fjsp_storel_v2r8(ptrB+2,t7);
696 _fjsp_storeh_v2r8(ptrB+3,t7);
697 _fjsp_storel_v2r8(ptrB+4,t8);
698 _fjsp_storeh_v2r8(ptrB+5,t8);
699 _fjsp_storel_v2r8(ptrB+6,t9);
700 _fjsp_storeh_v2r8(ptrB+7,t9);
701 _fjsp_storel_v2r8(ptrB+8,t10);
706 gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
707 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
708 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
709 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
710 _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
712 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
713 _fjsp_v2r8 tA,tB,tC,tD,tE,tF,tG,tH,tI,tJ,tK,tL;
715 t1 = _fjsp_load_v2r8(ptrA);
716 t2 = _fjsp_load_v2r8(ptrA+2);
717 t3 = _fjsp_load_v2r8(ptrA+4);
718 t4 = _fjsp_load_v2r8(ptrA+6);
719 t5 = _fjsp_load_v2r8(ptrA+8);
720 t6 = _fjsp_load_v2r8(ptrA+10);
721 t7 = _fjsp_load_v2r8(ptrB);
722 t8 = _fjsp_load_v2r8(ptrB+2);
723 t9 = _fjsp_load_v2r8(ptrB+4);
724 t10 = _fjsp_load_v2r8(ptrB+6);
725 t11 = _fjsp_load_v2r8(ptrB+8);
726 t12 = _fjsp_load_v2r8(ptrB+10);
728 tA = _fjsp_unpacklo_v2r8(x1,y1);
729 tB = _fjsp_unpackhi_v2r8(x1,y1);
730 tC = _fjsp_unpacklo_v2r8(z1,x2);
731 tD = _fjsp_unpackhi_v2r8(z1,x2);
732 tE = _fjsp_unpacklo_v2r8(y2,z2);
733 tF = _fjsp_unpackhi_v2r8(y2,z2);
734 tG = _fjsp_unpacklo_v2r8(x3,y3);
735 tH = _fjsp_unpackhi_v2r8(x3,y3);
736 tI = _fjsp_unpacklo_v2r8(z3,x4);
737 tJ = _fjsp_unpackhi_v2r8(z3,x4);
738 tK = _fjsp_unpacklo_v2r8(y4,z4);
739 tL = _fjsp_unpackhi_v2r8(y4,z4);
741 t1 = _fjsp_sub_v2r8(t1,tA);
742 t2 = _fjsp_sub_v2r8(t2,tC);
743 t3 = _fjsp_sub_v2r8(t3,tE);
744 t4 = _fjsp_sub_v2r8(t4,tG);
745 t5 = _fjsp_sub_v2r8(t5,tI);
746 t6 = _fjsp_sub_v2r8(t6,tK);
748 t7 = _fjsp_sub_v2r8(t7,tB);
749 t8 = _fjsp_sub_v2r8(t8,tD);
750 t9 = _fjsp_sub_v2r8(t9,tF);
751 t10 = _fjsp_sub_v2r8(t10,tH);
752 t11 = _fjsp_sub_v2r8(t11,tJ);
753 t12 = _fjsp_sub_v2r8(t12,tL);
755 _fjsp_storel_v2r8(ptrA, t1);
756 _fjsp_storeh_v2r8(ptrA+1,t1);
757 _fjsp_storel_v2r8(ptrA+2,t2);
758 _fjsp_storeh_v2r8(ptrA+3,t2);
759 _fjsp_storel_v2r8(ptrA+4,t3);
760 _fjsp_storeh_v2r8(ptrA+5,t3);
761 _fjsp_storel_v2r8(ptrA+6,t4);
762 _fjsp_storeh_v2r8(ptrA+7,t4);
763 _fjsp_storel_v2r8(ptrA+8,t5);
764 _fjsp_storeh_v2r8(ptrA+9,t5);
765 _fjsp_storel_v2r8(ptrA+10,t6);
766 _fjsp_storeh_v2r8(ptrA+11,t6);
767 _fjsp_storel_v2r8(ptrB, t7);
768 _fjsp_storeh_v2r8(ptrB+1,t7);
769 _fjsp_storel_v2r8(ptrB+2,t8);
770 _fjsp_storeh_v2r8(ptrB+3,t8);
771 _fjsp_storel_v2r8(ptrB+4,t9);
772 _fjsp_storeh_v2r8(ptrB+5,t9);
773 _fjsp_storel_v2r8(ptrB+6,t10);
774 _fjsp_storeh_v2r8(ptrB+7,t10);
775 _fjsp_storel_v2r8(ptrB+8,t11);
776 _fjsp_storeh_v2r8(ptrB+9,t11);
777 _fjsp_storel_v2r8(ptrB+10,t12);
778 _fjsp_storeh_v2r8(ptrB+11,t12);
783 static gmx_inline void
784 gmx_fjsp_update_iforce_1atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
785 double * gmx_restrict fptr,
786 double * gmx_restrict fshiftptr)
792 fix1 = _fjsp_unpacklo_v2r8(fix1,fiy1); /* y0 x0 */
793 fiy1 = _fjsp_unpackhi_v2r8(t1,fiy1); /* y1 x1 */
795 fix1 = _fjsp_add_v2r8(fix1,fiy1);
796 fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1,fiz1 ));
798 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
799 _fjsp_storel_v2r8( fptr, t4 );
800 _fjsp_storeh_v2r8( fptr+1, t4 );
801 _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+2), fiz1 ));
803 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
804 _fjsp_storel_v2r8( fshiftptr, t4 );
805 _fjsp_storeh_v2r8( fshiftptr+1, t4 );
806 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
809 static gmx_inline void
810 gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
811 _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
812 _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
813 double * gmx_restrict fptr,
814 double * gmx_restrict fshiftptr)
816 __m128d t1,t2,t3,t4,t5,t6;
819 GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
820 GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
821 GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
823 fix3 = _fjsp_unpacklo_v2r8(fix3,fiy3); /* y0 x0 */
824 fiy3 = _fjsp_unpackhi_v2r8(t1,fiy3); /* y1 x1 */
826 fix1 = _fjsp_add_v2r8(fix1,fiy1);
827 fiz1 = _fjsp_add_v2r8(fiz1,fix2);
828 fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
830 fix3 = _fjsp_add_v2r8(fix3,fiy3);
831 fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3,fiz3));
833 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
834 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
835 t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
836 t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
838 _fjsp_storel_v2r8( fptr, t3 );
839 _fjsp_storeh_v2r8( fptr+1, t3 );
840 _fjsp_storel_v2r8( fptr+2, t4 );
841 _fjsp_storeh_v2r8( fptr+3, t4 );
842 _fjsp_storel_v2r8( fptr+4, t5 );
843 _fjsp_storeh_v2r8( fptr+5, t5 );
844 _fjsp_storel_v2r8( fptr+6, t6 );
845 _fjsp_storeh_v2r8( fptr+7, t6 );
846 _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+8), fiz3 ));
848 fix1 = _fjsp_add_v2r8(fix1,fix3);
849 t1 = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
850 fix1 = _fjsp_add_v2r8(fix1,t1); /* x and y sums */
852 t2 = _fjsp_shuffle_v2r8(fiy2,fiy2,GMX_FJSP_SHUFFLE2(1,1));
853 fiz1 = _fjsp_add_v2r8(fiz1,fiz3);
854 fiz1 = _fjsp_add_v2r8(fiz1,t2); /* z sum */
856 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
857 _fjsp_storel_v2r8( fshiftptr, t3 );
858 _fjsp_storeh_v2r8( fshiftptr+1, t3 );
859 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
863 static gmx_inline void
864 gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
865 _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
866 _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
867 _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
868 double * gmx_restrict fptr,
869 double * gmx_restrict fshiftptr)
871 __m128d t1,t2,t3,t4,t5,t6,t7,t8;
874 GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
875 GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
876 GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
877 GMX_FJSP_TRANSPOSE2_V2R8(fix3,fiy3);
878 GMX_FJSP_TRANSPOSE2_V2R8(fiz3,fix4);
879 GMX_FJSP_TRANSPOSE2_V2R8(fiy4,fiz4);
881 fix1 = _fjsp_add_v2r8(fix1,fiy1);
882 fiz1 = _fjsp_add_v2r8(fiz1,fix2);
883 fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
884 fix3 = _fjsp_add_v2r8(fix3,fiy3);
885 fiz3 = _fjsp_add_v2r8(fiz3,fix4);
886 fiy4 = _fjsp_add_v2r8(fiy4,fiz4);
888 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
889 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
890 t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
891 t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
892 t7 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+8), fiz3 );
893 t8 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+10), fiy4 );
894 _fjsp_storel_v2r8( fptr, t3 );
895 _fjsp_storeh_v2r8( fptr+1, t3 );
896 _fjsp_storel_v2r8( fptr+2, t4 );
897 _fjsp_storeh_v2r8( fptr+3, t4 );
898 _fjsp_storel_v2r8( fptr+4, t5 );
899 _fjsp_storeh_v2r8( fptr+5, t5 );
900 _fjsp_storel_v2r8( fptr+6, t6 );
901 _fjsp_storeh_v2r8( fptr+7, t6 );
902 _fjsp_storel_v2r8( fptr+8, t7 );
903 _fjsp_storeh_v2r8( fptr+9, t7 );
904 _fjsp_storel_v2r8( fptr+10, t8 );
905 _fjsp_storeh_v2r8( fptr+11, t8 );
907 t1 = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
908 fix1 = _fjsp_add_v2r8(fix1,t1);
909 t2 = _fjsp_shuffle_v2r8(fiz3,fiy4,GMX_FJSP_SHUFFLE2(0,1));
910 fix3 = _fjsp_add_v2r8(fix3,t2);
911 fix1 = _fjsp_add_v2r8(fix1,fix3); /* x and y sums */
913 fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2,fiy2));
914 fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4,fiy4));
915 fiz1 = _fjsp_add_v2r8(fiz1,fiz3); /* z sum */
917 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
918 _fjsp_storel_v2r8( fshiftptr, t3 );
919 _fjsp_storeh_v2r8( fshiftptr+1, t3 );
920 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
925 static gmx_inline void
926 gmx_fjsp_update_1pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA)
928 pot1 = _fjsp_add_v2r8(pot1, _fjsp_unpackhi_v2r8(pot1,pot1));
929 _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(pot1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA)));
932 static gmx_inline void
933 gmx_fjsp_update_2pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA,
934 _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
936 GMX_FJSP_TRANSPOSE2_V2R8(pot1,pot2);
937 pot1 = _fjsp_add_v2r8(pot1,pot2);
938 pot2 = _fjsp_unpackhi_v2r8(pot1,pot1);
940 _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(pot1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA)));
941 _fjsp_storel_v2r8(ptrB,_fjsp_add_v2r8(pot2,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB)));
945 #endif /* _kernelutil_sparc64_hpc_ace_double_h_ */