/*
 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
35 #ifndef _kernelutil_sparc64_hpc_ace_double_h_
36 #define _kernelutil_sparc64_hpc_ace_double_h_
38 /* Fujitsu header borrows the name from SSE2, since some instructions have aliases */
39 #include "emmintrin.h"
41 #define GMX_FJSP_SHUFFLE2(x,y) (((x)<<1) | (y))
/* In-place 2x2 transpose of two v2r8 registers (swap the cross elements).
 * Multi-statement macro; the temporary lives in its own block scope.
 */
#define GMX_FJSP_TRANSPOSE2_V2R8(row0, row1) {            \
        _fjsp_v2r8 __gmx_t1 = row0;                       \
        row0 = _fjsp_unpacklo_v2r8(row0,row1);            \
        row1 = _fjsp_unpackhi_v2r8(__gmx_t1,row1);        \
}
51 gmx_fjsp_print_v2r8(const char *s, _fjsp_v2r8 a)
55 _fjsp_storel_v2r8(&lo,a);
56 _fjsp_storeh_v2r8(&hi,a);
57 printf("%s: %g %g\n",s,lo,hi);
62 gmx_fjsp_set1_v2r8(double d)
64 return _fjsp_set_v2r8(d,d);
68 gmx_fjsp_load1_v2r8(const double * gmx_restrict ptr)
70 return gmx_fjsp_set1_v2r8(*ptr);
75 gmx_fjsp_any_lt_v2r8(_fjsp_v2r8 a, _fjsp_v2r8 b)
84 a = _fjsp_cmplt_v2r8(a,b);
85 a = _fjsp_or_v2r8(a, _fjsp_unpackhi_v2r8(a,a));
86 _fjsp_storel_v2r8(&(conv.d),a);
91 static gmx_inline _fjsp_v2r8
92 gmx_fjsp_invsqrt_v2r8(_fjsp_v2r8 x)
94 const _fjsp_v2r8 half = gmx_fjsp_set1_v2r8(0.5);
95 const _fjsp_v2r8 three = gmx_fjsp_set1_v2r8(3.0);
96 _fjsp_v2r8 lu = _fjsp_rsqrta_v2r8(x);
98 lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
99 /* The HPC-ACE instruction set is only available in double precision, while
100 * single precision is typically sufficient for Gromacs. If you define
101 * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
102 * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
103 * double precision (53 bits). This is still clearly higher than single precision (24 bits).
105 #ifndef GMX_RELAXED_DOUBLE_PRECISION
106 lu = _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
108 return _fjsp_mul_v2r8(_fjsp_mul_v2r8(half,lu),_fjsp_nmsub_v2r8(_fjsp_mul_v2r8(lu,lu),x,three));
113 static gmx_inline _fjsp_v2r8
114 gmx_fjsp_inv_v2r8(_fjsp_v2r8 x)
116 const _fjsp_v2r8 two = gmx_fjsp_set1_v2r8(2.0);
117 __m128d lu = _fjsp_rcpa_v2r8(x);
119 /* Perform three N-R steps for double precision */
120 lu = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
121 /* The HPC-ACE instruction set is only available in double precision, while
122 * single precision is typically sufficient for Gromacs. If you define
123 * "GMX_RELAXED_DOUBLE_PRECISION" during compile, we stick to two Newton-Raphson
124 * iterations and accept 32bits of accuracy in 1.0/sqrt(x) and 1.0/x, rather than full
125 * double precision (53 bits). This is still clearly higher than single precision (24 bits).
127 #ifndef GMX_RELAXED_DOUBLE_PRECISION
128 lu = _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
130 return _fjsp_mul_v2r8(lu,_fjsp_nmsub_v2r8(lu,x,two));
134 static gmx_inline _fjsp_v2r8
135 gmx_fjsp_calc_rsq_v2r8(_fjsp_v2r8 dx, _fjsp_v2r8 dy, _fjsp_v2r8 dz)
137 return _fjsp_madd_v2r8(dx,dx,_fjsp_madd_v2r8(dy,dy,_fjsp_mul_v2r8(dz,dz)));
/* Normal sum of four v2r8 registers */
#define gmx_fjsp_sum4_v2r8(t0,t1,t2,t3) _fjsp_add_v2r8(_fjsp_add_v2r8(t0,t1),_fjsp_add_v2r8(t2,t3))
148 gmx_fjsp_load_2real_swizzle_v2r8(const double * gmx_restrict ptrA,
149 const double * gmx_restrict ptrB)
151 return _fjsp_unpacklo_v2r8(_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA),_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB));
155 gmx_fjsp_load_1real_v2r8(const double * gmx_restrict ptrA)
157 return _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
162 gmx_fjsp_store_2real_swizzle_v2r8(double * gmx_restrict ptrA,
163 double * gmx_restrict ptrB,
168 t2 = _fjsp_unpackhi_v2r8(xmm1,xmm1);
169 _fjsp_storel_v2r8(ptrA,xmm1);
170 _fjsp_storel_v2r8(ptrB,t2);
174 gmx_fjsp_store_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
176 _fjsp_storel_v2r8(ptrA,xmm1);
180 /* Similar to store, but increments value in memory */
182 gmx_fjsp_increment_2real_swizzle_v2r8(double * gmx_restrict ptrA,
183 double * gmx_restrict ptrB, _fjsp_v2r8 xmm1)
187 t1 = _fjsp_unpackhi_v2r8(xmm1,xmm1);
188 xmm1 = _fjsp_add_v2r8(xmm1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA));
189 t1 = _fjsp_add_v2r8(t1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB));
190 _fjsp_storel_v2r8(ptrA,xmm1);
191 _fjsp_storel_v2r8(ptrB,t1);
195 gmx_fjsp_increment_1real_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 xmm1)
199 tmp = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
200 tmp = _fjsp_add_v2r8(tmp,xmm1);
201 _fjsp_storel_v2r8(ptrA,tmp);
206 static gmx_inline void
207 gmx_fjsp_load_2pair_swizzle_v2r8(const double * gmx_restrict p1,
208 const double * gmx_restrict p2,
209 _fjsp_v2r8 * gmx_restrict c6,
210 _fjsp_v2r8 * gmx_restrict c12)
214 /* The c6/c12 array should be aligned */
215 t1 = _fjsp_load_v2r8(p1);
216 t2 = _fjsp_load_v2r8(p2);
217 *c6 = _fjsp_unpacklo_v2r8(t1,t2);
218 *c12 = _fjsp_unpackhi_v2r8(t1,t2);
221 static gmx_inline void
222 gmx_fjsp_load_1pair_swizzle_v2r8(const double * gmx_restrict p1,
223 _fjsp_v2r8 * gmx_restrict c6,
224 _fjsp_v2r8 * gmx_restrict c12)
226 *c6 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
227 *c12 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
231 static gmx_inline void
232 gmx_fjsp_load_shift_and_1rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
233 const double * gmx_restrict xyz,
234 _fjsp_v2r8 * gmx_restrict x1,
235 _fjsp_v2r8 * gmx_restrict y1,
236 _fjsp_v2r8 * gmx_restrict z1)
238 _fjsp_v2r8 mem_xy,mem_z,mem_sxy,mem_sz;
240 mem_xy = _fjsp_load_v2r8(xyz);
241 mem_z = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+2);
242 mem_sxy = _fjsp_load_v2r8(xyz_shift);
243 mem_sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
245 mem_xy = _fjsp_add_v2r8(mem_xy,mem_sxy);
246 mem_z = _fjsp_add_v2r8(mem_z,mem_sz);
248 *x1 = _fjsp_shuffle_v2r8(mem_xy,mem_xy,GMX_FJSP_SHUFFLE2(0,0));
249 *y1 = _fjsp_shuffle_v2r8(mem_xy,mem_xy,GMX_FJSP_SHUFFLE2(1,1));
250 *z1 = _fjsp_shuffle_v2r8(mem_z,mem_z,GMX_FJSP_SHUFFLE2(0,0));
254 static gmx_inline void
255 gmx_fjsp_load_shift_and_3rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
256 const double * gmx_restrict xyz,
257 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
258 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
259 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
261 _fjsp_v2r8 t1,t2,t3,t4,t5,sxy,sz,szx,syz;
263 t1 = _fjsp_load_v2r8(xyz);
264 t2 = _fjsp_load_v2r8(xyz+2);
265 t3 = _fjsp_load_v2r8(xyz+4);
266 t4 = _fjsp_load_v2r8(xyz+6);
267 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz+8);
269 sxy = _fjsp_load_v2r8(xyz_shift);
270 sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
271 szx = _fjsp_shuffle_v2r8(sz,sxy,GMX_FJSP_SHUFFLE2(0,0));
272 syz = _fjsp_shuffle_v2r8(sxy,sz,GMX_FJSP_SHUFFLE2(0,1));
274 t1 = _fjsp_add_v2r8(t1,sxy);
275 t2 = _fjsp_add_v2r8(t2,szx);
276 t3 = _fjsp_add_v2r8(t3,syz);
277 t4 = _fjsp_add_v2r8(t4,sxy);
278 t5 = _fjsp_add_v2r8(t5,sz);
280 *x1 = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(0,0));
281 *y1 = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(1,1));
282 *z1 = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(0,0));
283 *x2 = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(1,1));
284 *y2 = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(0,0));
285 *z2 = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(1,1));
286 *x3 = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(0,0));
287 *y3 = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(1,1));
288 *z3 = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(0,0));
292 static gmx_inline void
293 gmx_fjsp_load_shift_and_4rvec_broadcast_v2r8(const double * gmx_restrict xyz_shift,
294 const double * gmx_restrict xyz,
295 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
296 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
297 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
298 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
300 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,sxy,sz,szx,syz;
302 t1 = _fjsp_load_v2r8(xyz);
303 t2 = _fjsp_load_v2r8(xyz+2);
304 t3 = _fjsp_load_v2r8(xyz+4);
305 t4 = _fjsp_load_v2r8(xyz+6);
306 t5 = _fjsp_load_v2r8(xyz+8);
307 t6 = _fjsp_load_v2r8(xyz+10);
309 sxy = _fjsp_load_v2r8(xyz_shift);
310 sz = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),xyz_shift+2);
311 szx = _fjsp_shuffle_v2r8(sz,sxy,GMX_FJSP_SHUFFLE2(0,0));
312 syz = _fjsp_shuffle_v2r8(sxy,sz,GMX_FJSP_SHUFFLE2(0,1));
314 t1 = _fjsp_add_v2r8(t1,sxy);
315 t2 = _fjsp_add_v2r8(t2,szx);
316 t3 = _fjsp_add_v2r8(t3,syz);
317 t4 = _fjsp_add_v2r8(t4,sxy);
318 t5 = _fjsp_add_v2r8(t5,szx);
319 t6 = _fjsp_add_v2r8(t6,syz);
321 *x1 = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(0,0));
322 *y1 = _fjsp_shuffle_v2r8(t1,t1,GMX_FJSP_SHUFFLE2(1,1));
323 *z1 = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(0,0));
324 *x2 = _fjsp_shuffle_v2r8(t2,t2,GMX_FJSP_SHUFFLE2(1,1));
325 *y2 = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(0,0));
326 *z2 = _fjsp_shuffle_v2r8(t3,t3,GMX_FJSP_SHUFFLE2(1,1));
327 *x3 = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(0,0));
328 *y3 = _fjsp_shuffle_v2r8(t4,t4,GMX_FJSP_SHUFFLE2(1,1));
329 *z3 = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(0,0));
330 *x4 = _fjsp_shuffle_v2r8(t5,t5,GMX_FJSP_SHUFFLE2(1,1));
331 *y4 = _fjsp_shuffle_v2r8(t6,t6,GMX_FJSP_SHUFFLE2(0,0));
332 *z4 = _fjsp_shuffle_v2r8(t6,t6,GMX_FJSP_SHUFFLE2(1,1));
337 static gmx_inline void
338 gmx_fjsp_load_1rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
339 _fjsp_v2r8 * gmx_restrict x, _fjsp_v2r8 * gmx_restrict y, _fjsp_v2r8 * gmx_restrict z)
341 *x = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
342 *y = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
343 *z = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
346 static gmx_inline void
347 gmx_fjsp_load_3rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
348 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
349 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
350 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
352 *x1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
353 *y1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
354 *z1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
355 *x2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+3);
356 *y2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+4);
357 *z2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+5);
358 *x3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+6);
359 *y3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+7);
360 *z3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+8);
363 static gmx_inline void
364 gmx_fjsp_load_4rvec_1ptr_swizzle_v2r8(const double * gmx_restrict p1,
365 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
366 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
367 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
368 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
370 *x1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1);
371 *y1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+1);
372 *z1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+2);
373 *x2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+3);
374 *y2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+4);
375 *z2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+5);
376 *x3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+6);
377 *y3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+7);
378 *z3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+8);
379 *x4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+9);
380 *y4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+10);
381 *z4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),p1+11);
385 static gmx_inline void
386 gmx_fjsp_load_1rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA,
387 const double * gmx_restrict ptrB,
388 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1)
390 _fjsp_v2r8 t1,t2,t3,t4;
391 t1 = _fjsp_load_v2r8(ptrA);
392 t2 = _fjsp_load_v2r8(ptrB);
393 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
394 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
395 GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
398 *z1 = _fjsp_unpacklo_v2r8(t3,t4);
401 static gmx_inline void
402 gmx_fjsp_load_3rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
403 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
404 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
405 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3)
407 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
408 t1 = _fjsp_load_v2r8(ptrA);
409 t2 = _fjsp_load_v2r8(ptrB);
410 t3 = _fjsp_load_v2r8(ptrA+2);
411 t4 = _fjsp_load_v2r8(ptrB+2);
412 t5 = _fjsp_load_v2r8(ptrA+4);
413 t6 = _fjsp_load_v2r8(ptrB+4);
414 t7 = _fjsp_load_v2r8(ptrA+6);
415 t8 = _fjsp_load_v2r8(ptrB+6);
416 t9 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
417 t10 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
418 GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
419 GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
420 GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
421 GMX_FJSP_TRANSPOSE2_V2R8(t7,t8);
430 *z3 = _fjsp_unpacklo_v2r8(t9,t10);
434 static gmx_inline void
435 gmx_fjsp_load_4rvec_2ptr_swizzle_v2r8(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
436 _fjsp_v2r8 * gmx_restrict x1, _fjsp_v2r8 * gmx_restrict y1, _fjsp_v2r8 * gmx_restrict z1,
437 _fjsp_v2r8 * gmx_restrict x2, _fjsp_v2r8 * gmx_restrict y2, _fjsp_v2r8 * gmx_restrict z2,
438 _fjsp_v2r8 * gmx_restrict x3, _fjsp_v2r8 * gmx_restrict y3, _fjsp_v2r8 * gmx_restrict z3,
439 _fjsp_v2r8 * gmx_restrict x4, _fjsp_v2r8 * gmx_restrict y4, _fjsp_v2r8 * gmx_restrict z4)
441 _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
442 t1 = _fjsp_load_v2r8(ptrA);
443 t2 = _fjsp_load_v2r8(ptrB);
444 t3 = _fjsp_load_v2r8(ptrA+2);
445 t4 = _fjsp_load_v2r8(ptrB+2);
446 t5 = _fjsp_load_v2r8(ptrA+4);
447 t6 = _fjsp_load_v2r8(ptrB+4);
448 GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
449 GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
450 GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
457 t1 = _fjsp_load_v2r8(ptrA+6);
458 t2 = _fjsp_load_v2r8(ptrB+6);
459 t3 = _fjsp_load_v2r8(ptrA+8);
460 t4 = _fjsp_load_v2r8(ptrB+8);
461 t5 = _fjsp_load_v2r8(ptrA+10);
462 t6 = _fjsp_load_v2r8(ptrB+10);
463 GMX_FJSP_TRANSPOSE2_V2R8(t1,t2);
464 GMX_FJSP_TRANSPOSE2_V2R8(t3,t4);
465 GMX_FJSP_TRANSPOSE2_V2R8(t5,t6);
476 gmx_fjsp_decrement_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
477 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
481 t1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
482 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
483 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
485 t1 = _fjsp_sub_v2r8(t1,x1);
486 t2 = _fjsp_sub_v2r8(t2,y1);
487 t3 = _fjsp_sub_v2r8(t3,z1);
488 _fjsp_storel_v2r8(ptrA,t1);
489 _fjsp_storel_v2r8(ptrA+1,t2);
490 _fjsp_storel_v2r8(ptrA+2,t3);
494 gmx_fjsp_decrement_fma_1rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA, _fjsp_v2r8 fscal,
495 _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
499 t1 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA);
500 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+1);
501 t3 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
503 t1 = _fjsp_nmsub_v2r8(fscal,dx1,t1);
504 t2 = _fjsp_nmsub_v2r8(fscal,dy1,t2);
505 t3 = _fjsp_nmsub_v2r8(fscal,dz1,t3);
506 _fjsp_storel_v2r8(ptrA,t1);
507 _fjsp_storel_v2r8(ptrA+1,t2);
508 _fjsp_storel_v2r8(ptrA+2,t3);
513 gmx_fjsp_decrement_3rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
514 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
515 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
516 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
518 _fjsp_v2r8 t1,t2,t3,t4,t5;
520 t1 = _fjsp_load_v2r8(ptrA);
521 t2 = _fjsp_load_v2r8(ptrA+2);
522 t3 = _fjsp_load_v2r8(ptrA+4);
523 t4 = _fjsp_load_v2r8(ptrA+6);
524 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
526 x1 = _fjsp_unpacklo_v2r8(x1,y1);
527 z1 = _fjsp_unpacklo_v2r8(z1,x2);
528 y2 = _fjsp_unpacklo_v2r8(y2,z2);
529 x3 = _fjsp_unpacklo_v2r8(x3,y3);
530 /* nothing to be done for z3 */
532 t1 = _fjsp_sub_v2r8(t1,x1);
533 t2 = _fjsp_sub_v2r8(t2,z1);
534 t3 = _fjsp_sub_v2r8(t3,y2);
535 t4 = _fjsp_sub_v2r8(t4,x3);
536 t5 = _fjsp_sub_v2r8(t5,z3);
537 _fjsp_storel_v2r8(ptrA,t1);
538 _fjsp_storeh_v2r8(ptrA+1,t1);
539 _fjsp_storel_v2r8(ptrA+2,t2);
540 _fjsp_storeh_v2r8(ptrA+3,t2);
541 _fjsp_storel_v2r8(ptrA+4,t3);
542 _fjsp_storeh_v2r8(ptrA+5,t3);
543 _fjsp_storel_v2r8(ptrA+6,t4);
544 _fjsp_storeh_v2r8(ptrA+7,t4);
545 _fjsp_storel_v2r8(ptrA+8,t5);
550 gmx_fjsp_decrement_4rvec_1ptr_swizzle_v2r8(double * gmx_restrict ptrA,
551 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
552 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
553 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
554 _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
556 _fjsp_v2r8 t1,t2,t3,t4,t5,t6;
558 t1 = _fjsp_load_v2r8(ptrA);
559 t2 = _fjsp_load_v2r8(ptrA+2);
560 t3 = _fjsp_load_v2r8(ptrA+4);
561 t4 = _fjsp_load_v2r8(ptrA+6);
562 t5 = _fjsp_load_v2r8(ptrA+8);
563 t6 = _fjsp_load_v2r8(ptrA+10);
565 x1 = _fjsp_unpacklo_v2r8(x1,y1);
566 z1 = _fjsp_unpacklo_v2r8(z1,x2);
567 y2 = _fjsp_unpacklo_v2r8(y2,z2);
568 x3 = _fjsp_unpacklo_v2r8(x3,y3);
569 z3 = _fjsp_unpacklo_v2r8(z3,x4);
570 y4 = _fjsp_unpacklo_v2r8(y4,z4);
572 _fjsp_storel_v2r8(ptrA, _fjsp_sub_v2r8( t1,x1 ));
573 _fjsp_storeh_v2r8(ptrA+1, _fjsp_sub_v2r8( t1,x1 ));
574 _fjsp_storel_v2r8(ptrA+2, _fjsp_sub_v2r8( t2,z1 ));
575 _fjsp_storeh_v2r8(ptrA+3, _fjsp_sub_v2r8( t2,z1 ));
576 _fjsp_storel_v2r8(ptrA+4, _fjsp_sub_v2r8( t3,y2 ));
577 _fjsp_storeh_v2r8(ptrA+5, _fjsp_sub_v2r8( t3,y2 ));
578 _fjsp_storel_v2r8(ptrA+6, _fjsp_sub_v2r8( t4,x3 ));
579 _fjsp_storeh_v2r8(ptrA+7, _fjsp_sub_v2r8( t4,x3 ));
580 _fjsp_storel_v2r8(ptrA+8, _fjsp_sub_v2r8( t5,z3 ));
581 _fjsp_storeh_v2r8(ptrA+9, _fjsp_sub_v2r8( t5,z3 ));
582 _fjsp_storel_v2r8(ptrA+10, _fjsp_sub_v2r8( t6,y4 ));
583 _fjsp_storeh_v2r8(ptrA+11, _fjsp_sub_v2r8( t6,y4 ));
587 gmx_fjsp_decrement_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
588 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1)
590 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7;
592 t1 = _fjsp_load_v2r8(ptrA);
593 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
594 t3 = _fjsp_load_v2r8(ptrB);
595 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
597 t5 = _fjsp_unpacklo_v2r8(x1,y1);
598 t6 = _fjsp_unpackhi_v2r8(x1,y1);
599 t7 = _fjsp_unpackhi_v2r8(z1,z1);
601 t1 = _fjsp_sub_v2r8(t1,t5);
602 t2 = _fjsp_sub_v2r8(t2,z1);
604 t3 = _fjsp_sub_v2r8(t3,t6);
605 t4 = _fjsp_sub_v2r8(t4,t7);
607 _fjsp_storel_v2r8(ptrA,t1);
608 _fjsp_storeh_v2r8(ptrA+1,t1);
609 _fjsp_storel_v2r8(ptrA+2,t2);
610 _fjsp_storel_v2r8(ptrB,t3);
611 _fjsp_storeh_v2r8(ptrB+1,t3);
612 _fjsp_storel_v2r8(ptrB+2,t4);
617 gmx_fjsp_decrement_fma_1rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
618 _fjsp_v2r8 fscal, _fjsp_v2r8 dx1, _fjsp_v2r8 dy1, _fjsp_v2r8 dz1)
620 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,fscalA,fscalB;
622 t1 = _fjsp_load_v2r8(ptrA);
623 t2 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+2);
624 t3 = _fjsp_load_v2r8(ptrB);
625 t4 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+2);
626 fscalA = _fjsp_unpacklo_v2r8(fscal,fscal);
627 fscalB = _fjsp_unpackhi_v2r8(fscal,fscal);
629 t5 = _fjsp_unpacklo_v2r8(dx1,dy1);
630 t6 = _fjsp_unpackhi_v2r8(dx1,dy1);
631 t7 = _fjsp_unpackhi_v2r8(dz1,dz1);
633 t1 = _fjsp_nmsub_v2r8(fscalA,t5,t1);
634 t2 = _fjsp_nmsub_v2r8(fscalA,dz1,t2);
636 t3 = _fjsp_nmsub_v2r8(fscalB,t6,t3);
637 t4 = _fjsp_nmsub_v2r8(fscalB,t7,t4);
639 _fjsp_storel_v2r8(ptrA,t1);
640 _fjsp_storeh_v2r8(ptrA+1,t1);
641 _fjsp_storel_v2r8(ptrA+2,t2);
642 _fjsp_storel_v2r8(ptrB,t3);
643 _fjsp_storeh_v2r8(ptrB+1,t3);
644 _fjsp_storel_v2r8(ptrB+2,t4);
649 gmx_fjsp_decrement_3rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
650 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
651 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
652 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3)
654 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
655 _fjsp_v2r8 tA,tB,tC,tD,tE,tF,tG,tH,tI;
657 t1 = _fjsp_load_v2r8(ptrA);
658 t2 = _fjsp_load_v2r8(ptrA+2);
659 t3 = _fjsp_load_v2r8(ptrA+4);
660 t4 = _fjsp_load_v2r8(ptrA+6);
661 t5 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA+8);
662 t6 = _fjsp_load_v2r8(ptrB);
663 t7 = _fjsp_load_v2r8(ptrB+2);
664 t8 = _fjsp_load_v2r8(ptrB+4);
665 t9 = _fjsp_load_v2r8(ptrB+6);
666 t10 = _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB+8);
668 tA = _fjsp_unpacklo_v2r8(x1,y1);
669 tB = _fjsp_unpackhi_v2r8(x1,y1);
670 tC = _fjsp_unpacklo_v2r8(z1,x2);
671 tD = _fjsp_unpackhi_v2r8(z1,x2);
672 tE = _fjsp_unpacklo_v2r8(y2,z2);
673 tF = _fjsp_unpackhi_v2r8(y2,z2);
674 tG = _fjsp_unpacklo_v2r8(x3,y3);
675 tH = _fjsp_unpackhi_v2r8(x3,y3);
676 tI = _fjsp_unpackhi_v2r8(z3,z3);
678 t1 = _fjsp_sub_v2r8(t1,tA);
679 t2 = _fjsp_sub_v2r8(t2,tC);
680 t3 = _fjsp_sub_v2r8(t3,tE);
681 t4 = _fjsp_sub_v2r8(t4,tG);
682 t5 = _fjsp_sub_v2r8(t5,z3);
684 t6 = _fjsp_sub_v2r8(t6,tB);
685 t7 = _fjsp_sub_v2r8(t7,tD);
686 t8 = _fjsp_sub_v2r8(t8,tF);
687 t9 = _fjsp_sub_v2r8(t9,tH);
688 t10 = _fjsp_sub_v2r8(t10,tI);
690 _fjsp_storel_v2r8(ptrA,t1);
691 _fjsp_storeh_v2r8(ptrA+1,t1);
692 _fjsp_storel_v2r8(ptrA+2,t2);
693 _fjsp_storeh_v2r8(ptrA+3,t2);
694 _fjsp_storel_v2r8(ptrA+4,t3);
695 _fjsp_storeh_v2r8(ptrA+5,t3);
696 _fjsp_storel_v2r8(ptrA+6,t4);
697 _fjsp_storeh_v2r8(ptrA+7,t4);
698 _fjsp_storel_v2r8(ptrA+8,t5);
699 _fjsp_storel_v2r8(ptrB,t6);
700 _fjsp_storeh_v2r8(ptrB+1,t6);
701 _fjsp_storel_v2r8(ptrB+2,t7);
702 _fjsp_storeh_v2r8(ptrB+3,t7);
703 _fjsp_storel_v2r8(ptrB+4,t8);
704 _fjsp_storeh_v2r8(ptrB+5,t8);
705 _fjsp_storel_v2r8(ptrB+6,t9);
706 _fjsp_storeh_v2r8(ptrB+7,t9);
707 _fjsp_storel_v2r8(ptrB+8,t10);
712 gmx_fjsp_decrement_4rvec_2ptr_swizzle_v2r8(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
713 _fjsp_v2r8 x1, _fjsp_v2r8 y1, _fjsp_v2r8 z1,
714 _fjsp_v2r8 x2, _fjsp_v2r8 y2, _fjsp_v2r8 z2,
715 _fjsp_v2r8 x3, _fjsp_v2r8 y3, _fjsp_v2r8 z3,
716 _fjsp_v2r8 x4, _fjsp_v2r8 y4, _fjsp_v2r8 z4)
718 _fjsp_v2r8 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12;
719 _fjsp_v2r8 tA,tB,tC,tD,tE,tF,tG,tH,tI,tJ,tK,tL;
721 t1 = _fjsp_load_v2r8(ptrA);
722 t2 = _fjsp_load_v2r8(ptrA+2);
723 t3 = _fjsp_load_v2r8(ptrA+4);
724 t4 = _fjsp_load_v2r8(ptrA+6);
725 t5 = _fjsp_load_v2r8(ptrA+8);
726 t6 = _fjsp_load_v2r8(ptrA+10);
727 t7 = _fjsp_load_v2r8(ptrB);
728 t8 = _fjsp_load_v2r8(ptrB+2);
729 t9 = _fjsp_load_v2r8(ptrB+4);
730 t10 = _fjsp_load_v2r8(ptrB+6);
731 t11 = _fjsp_load_v2r8(ptrB+8);
732 t12 = _fjsp_load_v2r8(ptrB+10);
734 tA = _fjsp_unpacklo_v2r8(x1,y1);
735 tB = _fjsp_unpackhi_v2r8(x1,y1);
736 tC = _fjsp_unpacklo_v2r8(z1,x2);
737 tD = _fjsp_unpackhi_v2r8(z1,x2);
738 tE = _fjsp_unpacklo_v2r8(y2,z2);
739 tF = _fjsp_unpackhi_v2r8(y2,z2);
740 tG = _fjsp_unpacklo_v2r8(x3,y3);
741 tH = _fjsp_unpackhi_v2r8(x3,y3);
742 tI = _fjsp_unpacklo_v2r8(z3,x4);
743 tJ = _fjsp_unpackhi_v2r8(z3,x4);
744 tK = _fjsp_unpacklo_v2r8(y4,z4);
745 tL = _fjsp_unpackhi_v2r8(y4,z4);
747 t1 = _fjsp_sub_v2r8(t1,tA);
748 t2 = _fjsp_sub_v2r8(t2,tC);
749 t3 = _fjsp_sub_v2r8(t3,tE);
750 t4 = _fjsp_sub_v2r8(t4,tG);
751 t5 = _fjsp_sub_v2r8(t5,tI);
752 t6 = _fjsp_sub_v2r8(t6,tK);
754 t7 = _fjsp_sub_v2r8(t7,tB);
755 t8 = _fjsp_sub_v2r8(t8,tD);
756 t9 = _fjsp_sub_v2r8(t9,tF);
757 t10 = _fjsp_sub_v2r8(t10,tH);
758 t11 = _fjsp_sub_v2r8(t11,tJ);
759 t12 = _fjsp_sub_v2r8(t12,tL);
761 _fjsp_storel_v2r8(ptrA, t1);
762 _fjsp_storeh_v2r8(ptrA+1,t1);
763 _fjsp_storel_v2r8(ptrA+2,t2);
764 _fjsp_storeh_v2r8(ptrA+3,t2);
765 _fjsp_storel_v2r8(ptrA+4,t3);
766 _fjsp_storeh_v2r8(ptrA+5,t3);
767 _fjsp_storel_v2r8(ptrA+6,t4);
768 _fjsp_storeh_v2r8(ptrA+7,t4);
769 _fjsp_storel_v2r8(ptrA+8,t5);
770 _fjsp_storeh_v2r8(ptrA+9,t5);
771 _fjsp_storel_v2r8(ptrA+10,t6);
772 _fjsp_storeh_v2r8(ptrA+11,t6);
773 _fjsp_storel_v2r8(ptrB, t7);
774 _fjsp_storeh_v2r8(ptrB+1,t7);
775 _fjsp_storel_v2r8(ptrB+2,t8);
776 _fjsp_storeh_v2r8(ptrB+3,t8);
777 _fjsp_storel_v2r8(ptrB+4,t9);
778 _fjsp_storeh_v2r8(ptrB+5,t9);
779 _fjsp_storel_v2r8(ptrB+6,t10);
780 _fjsp_storeh_v2r8(ptrB+7,t10);
781 _fjsp_storel_v2r8(ptrB+8,t11);
782 _fjsp_storeh_v2r8(ptrB+9,t11);
783 _fjsp_storel_v2r8(ptrB+10,t12);
784 _fjsp_storeh_v2r8(ptrB+11,t12);
789 static gmx_inline void
790 gmx_fjsp_update_iforce_1atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
791 double * gmx_restrict fptr,
792 double * gmx_restrict fshiftptr)
798 fix1 = _fjsp_unpacklo_v2r8(fix1,fiy1); /* y0 x0 */
799 fiy1 = _fjsp_unpackhi_v2r8(t1,fiy1); /* y1 x1 */
801 fix1 = _fjsp_add_v2r8(fix1,fiy1);
802 fiz1 = _fjsp_add_v2r8( fiz1, _fjsp_unpackhi_v2r8(fiz1,fiz1 ));
804 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
805 _fjsp_storel_v2r8( fptr, t4 );
806 _fjsp_storeh_v2r8( fptr+1, t4 );
807 _fjsp_storel_v2r8( fptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+2), fiz1 ));
809 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
810 _fjsp_storel_v2r8( fshiftptr, t4 );
811 _fjsp_storeh_v2r8( fshiftptr+1, t4 );
812 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
815 static gmx_inline void
816 gmx_fjsp_update_iforce_3atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
817 _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
818 _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
819 double * gmx_restrict fptr,
820 double * gmx_restrict fshiftptr)
822 __m128d t1,t2,t3,t4,t5,t6;
825 GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
826 GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
827 GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
829 fix3 = _fjsp_unpacklo_v2r8(fix3,fiy3); /* y0 x0 */
830 fiy3 = _fjsp_unpackhi_v2r8(t1,fiy3); /* y1 x1 */
832 fix1 = _fjsp_add_v2r8(fix1,fiy1);
833 fiz1 = _fjsp_add_v2r8(fiz1,fix2);
834 fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
836 fix3 = _fjsp_add_v2r8(fix3,fiy3);
837 fiz3 = _fjsp_add_v2r8( fiz3, _fjsp_unpackhi_v2r8(fiz3,fiz3));
839 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
840 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
841 t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
842 t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
844 _fjsp_storel_v2r8( fptr, t3 );
845 _fjsp_storeh_v2r8( fptr+1, t3 );
846 _fjsp_storel_v2r8( fptr+2, t4 );
847 _fjsp_storeh_v2r8( fptr+3, t4 );
848 _fjsp_storel_v2r8( fptr+4, t5 );
849 _fjsp_storeh_v2r8( fptr+5, t5 );
850 _fjsp_storel_v2r8( fptr+6, t6 );
851 _fjsp_storeh_v2r8( fptr+7, t6 );
852 _fjsp_storel_v2r8( fptr+8, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fptr+8), fiz3 ));
854 fix1 = _fjsp_add_v2r8(fix1,fix3);
855 t1 = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
856 fix1 = _fjsp_add_v2r8(fix1,t1); /* x and y sums */
858 t2 = _fjsp_shuffle_v2r8(fiy2,fiy2,GMX_FJSP_SHUFFLE2(1,1));
859 fiz1 = _fjsp_add_v2r8(fiz1,fiz3);
860 fiz1 = _fjsp_add_v2r8(fiz1,t2); /* z sum */
862 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
863 _fjsp_storel_v2r8( fshiftptr, t3 );
864 _fjsp_storeh_v2r8( fshiftptr+1, t3 );
865 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
869 static gmx_inline void
870 gmx_fjsp_update_iforce_4atom_swizzle_v2r8(_fjsp_v2r8 fix1, _fjsp_v2r8 fiy1, _fjsp_v2r8 fiz1,
871 _fjsp_v2r8 fix2, _fjsp_v2r8 fiy2, _fjsp_v2r8 fiz2,
872 _fjsp_v2r8 fix3, _fjsp_v2r8 fiy3, _fjsp_v2r8 fiz3,
873 _fjsp_v2r8 fix4, _fjsp_v2r8 fiy4, _fjsp_v2r8 fiz4,
874 double * gmx_restrict fptr,
875 double * gmx_restrict fshiftptr)
877 __m128d t1,t2,t3,t4,t5,t6,t7,t8;
880 GMX_FJSP_TRANSPOSE2_V2R8(fix1,fiy1);
881 GMX_FJSP_TRANSPOSE2_V2R8(fiz1,fix2);
882 GMX_FJSP_TRANSPOSE2_V2R8(fiy2,fiz2);
883 GMX_FJSP_TRANSPOSE2_V2R8(fix3,fiy3);
884 GMX_FJSP_TRANSPOSE2_V2R8(fiz3,fix4);
885 GMX_FJSP_TRANSPOSE2_V2R8(fiy4,fiz4);
887 fix1 = _fjsp_add_v2r8(fix1,fiy1);
888 fiz1 = _fjsp_add_v2r8(fiz1,fix2);
889 fiy2 = _fjsp_add_v2r8(fiy2,fiz2);
890 fix3 = _fjsp_add_v2r8(fix3,fiy3);
891 fiz3 = _fjsp_add_v2r8(fiz3,fix4);
892 fiy4 = _fjsp_add_v2r8(fiy4,fiz4);
894 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr), fix1 );
895 t4 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+2), fiz1 );
896 t5 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+4), fiy2 );
897 t6 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+6), fix3 );
898 t7 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+8), fiz3 );
899 t8 = _fjsp_add_v2r8( _fjsp_load_v2r8(fptr+10), fiy4 );
900 _fjsp_storel_v2r8( fptr, t3 );
901 _fjsp_storeh_v2r8( fptr+1, t3 );
902 _fjsp_storel_v2r8( fptr+2, t4 );
903 _fjsp_storeh_v2r8( fptr+3, t4 );
904 _fjsp_storel_v2r8( fptr+4, t5 );
905 _fjsp_storeh_v2r8( fptr+5, t5 );
906 _fjsp_storel_v2r8( fptr+6, t6 );
907 _fjsp_storeh_v2r8( fptr+7, t6 );
908 _fjsp_storel_v2r8( fptr+8, t7 );
909 _fjsp_storeh_v2r8( fptr+9, t7 );
910 _fjsp_storel_v2r8( fptr+10, t8 );
911 _fjsp_storeh_v2r8( fptr+11, t8 );
913 t1 = _fjsp_shuffle_v2r8(fiz1,fiy2,GMX_FJSP_SHUFFLE2(0,1));
914 fix1 = _fjsp_add_v2r8(fix1,t1);
915 t2 = _fjsp_shuffle_v2r8(fiz3,fiy4,GMX_FJSP_SHUFFLE2(0,1));
916 fix3 = _fjsp_add_v2r8(fix3,t2);
917 fix1 = _fjsp_add_v2r8(fix1,fix3); /* x and y sums */
919 fiz1 = _fjsp_add_v2r8(fiz1, _fjsp_unpackhi_v2r8(fiy2,fiy2));
920 fiz3 = _fjsp_add_v2r8(fiz3, _fjsp_unpackhi_v2r8(fiy4,fiy4));
921 fiz1 = _fjsp_add_v2r8(fiz1,fiz3); /* z sum */
923 t3 = _fjsp_add_v2r8( _fjsp_load_v2r8(fshiftptr), fix1 );
924 _fjsp_storel_v2r8( fshiftptr, t3 );
925 _fjsp_storeh_v2r8( fshiftptr+1, t3 );
926 _fjsp_storel_v2r8( fshiftptr+2, _fjsp_add_v2r8( _fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),fshiftptr+2), fiz1 ));
931 static gmx_inline void
932 gmx_fjsp_update_1pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA)
934 pot1 = _fjsp_add_v2r8(pot1, _fjsp_unpackhi_v2r8(pot1,pot1));
935 _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(pot1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA)));
938 static gmx_inline void
939 gmx_fjsp_update_2pot_v2r8(_fjsp_v2r8 pot1, double * gmx_restrict ptrA,
940 _fjsp_v2r8 pot2, double * gmx_restrict ptrB)
942 GMX_FJSP_TRANSPOSE2_V2R8(pot1,pot2);
943 pot1 = _fjsp_add_v2r8(pot1,pot2);
944 pot2 = _fjsp_unpackhi_v2r8(pot1,pot1);
946 _fjsp_storel_v2r8(ptrA,_fjsp_add_v2r8(pot1,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrA)));
947 _fjsp_storel_v2r8(ptrB,_fjsp_add_v2r8(pot2,_fjsp_loadl_v2r8(_fjsp_setzero_v2r8(),ptrB)));
951 #endif /* _kernelutil_sparc64_hpc_ace_double_h_ */