2 * This source code is part of
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
34 #include <emmintrin.h>
/* Debug helper: print the four single-precision lanes of an SSE register,
 * prefixed by the label s.
 * NOTE(review): the declaration/store of the local float array f[] is not
 * visible in this excerpt — presumably xmm is stored to f before the printf.
 */
gmx_printxmm_ps(const char *s,__m128 xmm)
    printf("%s: %15.10g %15.10g %15.10g %15.10g\n",s,f[0],f[1],f[2],f[3]);
/* Load x,y,z of one atom (one consecutive xyz triplet at ptr1) into
 * lane 0 of jx1/jy1/jz1. Upper lanes are unspecified.
 */
#define GMX_MM_LOAD_1JCOORD_1ATOM_PS(ptr1,jx1,jy1,jz1) { \
    jx1 = _mm_load_ss(ptr1); \
    jy1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+1));\
    jz1 = _mm_shuffle_ps(jy1,jy1,_MM_SHUFFLE(0,0,0,1)); \
/* Load x,y,z of two atoms (6 consecutive floats at ptr1) into lane 0 of
 * jx1..jz2. Uses one unaligned 4-float load plus one 2-float load;
 * upper lanes are unspecified.
 */
#define GMX_MM_LOAD_1JCOORD_2ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,1)); \
    jz1 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
    jx2 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,3)); \
    jy2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));\
    jz2 = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,1)); \
/* Load x,y,z of three atoms (9 consecutive floats at ptr1) into lane 0 of
 * jx1..jz3. Upper lanes are unspecified.
 */
#define GMX_MM_LOAD_1JCOORD_3ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,1)); \
    jz1 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
    jx2 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,3)); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,1)); \
    jx3 = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,2)); \
    jy3 = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,3)); \
    jz3 = _mm_load_ss(ptr1+8); \
/* Load x,y,z of four atoms (12 consecutive floats at ptr1) into lane 0 of
 * jx1..jz4, using three unaligned 4-float loads. Upper lanes unspecified.
 */
#define GMX_MM_LOAD_1JCOORD_4ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,1)); \
    jz1 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
    jx2 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,3)); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,1)); \
    jx3 = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,2)); \
    jy3 = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,3)); \
    jz3 = _mm_loadu_ps(ptr1+8); \
    jx4 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    jy4 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,2)); \
    jz4 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,3)); \
/* Load x,y,z of one atom from each of two j-particle arrays:
 * on exit lane 0 of jx1/jy1/jz1 holds ptr1's coordinate and lane 1 holds
 * ptr2's; upper lanes are unspecified.
 * Fixes: removed a redundant duplicate assignment to jx1, declared the
 * temporaries, and sourced the final movehl from jy1 so jz1 is never read
 * before it is written.
 */
#define GMX_MM_LOAD_2JCOORD_1ATOM_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
    jx1 = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jy1 = _mm_unpackhi_ps(_tmp1,_tmp2); \
    jz1 = _mm_movehl_ps(jy1,jy1); \
}
/* Load x,y,z of two atoms from each of two arrays (6 floats at ptr1 and at
 * ptr2). On exit lane 0 of jx1..jz2 holds ptr1's value, lane 1 ptr2's;
 * upper lanes are unspecified.
 */
#define GMX_MM_LOAD_2JCOORD_2ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jy2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
    jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
    jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
    jy2 = _mm_unpacklo_ps(jy2,_tmp2); \
    jy1 = _mm_movehl_ps(jx1,jx1); \
    jx2 = _mm_movehl_ps(jz1,jz1); \
    jz2 = _mm_movehl_ps(jy2,jy2); \
/* Load x,y,z of three atoms from each of two arrays (9 floats at ptr1 and
 * at ptr2). Lane 0 of jx1..jz3 = ptr1's value, lane 1 = ptr2's; upper
 * lanes are unspecified.
 */
#define GMX_MM_LOAD_2JCOORD_3ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jz3 = _mm_load_ss(ptr1+8); \
    _tmp3 = _mm_load_ss(ptr2+8); \
    jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
    jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
    jy2 = _mm_unpacklo_ps(_tmp2,jz2); \
    jx3 = _mm_unpackhi_ps(_tmp2,jz2); \
    jy1 = _mm_movehl_ps(jx1,jx1); \
    jx2 = _mm_movehl_ps(jz1,jz1); \
    jz2 = _mm_movehl_ps(jy2,jy2); \
    jy3 = _mm_movehl_ps(jx3,jx3); \
    jz3 = _mm_unpacklo_ps(jz3,_tmp3); \
/* Load x,y,z of four atoms from each of two arrays (12 floats at ptr1 and
 * at ptr2). Lane 0 of jx1..jz4 = ptr1's value, lane 1 = ptr2's; upper
 * lanes are unspecified.
 */
#define GMX_MM_LOAD_2JCOORD_4ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3,_tmp4; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2+8); \
    jx1 = _mm_unpacklo_ps(_tmp1,jy1); \
    jz1 = _mm_unpackhi_ps(_tmp1,jy1); \
    jy2 = _mm_unpacklo_ps(_tmp2,jz2); \
    jx3 = _mm_unpackhi_ps(_tmp2,jz2); \
    jz3 = _mm_unpacklo_ps(_tmp3,_tmp4); \
    jy4 = _mm_unpackhi_ps(_tmp3,_tmp4); \
    jy1 = _mm_movehl_ps(jx1,jx1); \
    jx2 = _mm_movehl_ps(jz1,jz1); \
    jz2 = _mm_movehl_ps(jy2,jy2); \
    jy3 = _mm_movehl_ps(jx3,jx3); \
    jx4 = _mm_movehl_ps(jz3,jz3); \
    jz4 = _mm_movehl_ps(jy4,jy4); \
/* Load x,y,z of one atom from each of three arrays. Lanes 0..2 of
 * jx1/jy1/jz1 hold the values from ptr1..ptr3; lane 3 is unspecified.
 */
#define GMX_MM_LOAD_3JCOORD_1ATOM_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp3,_tmp4; \
    jx1 = _mm_load_ss(ptr1); \
    jy1 = _mm_load_ss(ptr2); \
    jz1 = _mm_load_ss(ptr3); \
    jx1 = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1)); \
    jy1 = _mm_loadh_pi(jy1,(__m64 *)(ptr2+1)); \
    jz1 = _mm_loadh_pi(jz1,(__m64 *)(ptr3+1)); \
    _tmp1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp3 = _mm_unpackhi_ps(jx1,jy1); \
    _tmp4 = _mm_unpackhi_ps(jz1,jz1); \
    jx1 = _mm_movelh_ps(_tmp1,jz1); \
    jy1 = _mm_movelh_ps(_tmp3,_tmp4); \
    jz1 = _mm_movehl_ps(_tmp4,_tmp3); \
/* Load x,y,z of two atoms from each of three arrays. Lanes 0..2 of
 * jx1..jz2 hold ptr1..ptr3's values (lane 3 unspecified). Atom 1 is
 * gathered via a 4x4 transpose with a zero fourth row.
 */
#define GMX_MM_LOAD_3JCOORD_2ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    jz2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
    _tmp2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp1); \
    jy2 = _mm_movelh_ps(_tmp2,_tmp1); \
    jz2 = _mm_movehl_ps(_tmp1,_tmp2); \
/* Load x,y,z of three atoms from each of three arrays. Lanes 0..2 of
 * jx1..jz3 hold ptr1..ptr3's values (lane 3 unspecified). The first two
 * atoms use 4x4 transposes with a zero fourth row; atom 3's z comes from
 * three scalar loads merged with unpacks.
 */
#define GMX_MM_LOAD_3JCOORD_3ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_load_ss(ptr1+8); \
    _tmp1 = _mm_load_ss(ptr2+8); \
    _tmp2 = _mm_load_ss(ptr3+8); \
    jz3 = _mm_unpacklo_ps(jz3,_tmp2); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp1); \
    jz3 = _mm_unpacklo_ps(jz3,_tmp1); \
/* Load x,y,z of four atoms from each of three arrays via three 4x4
 * transposes with a zero fourth row. Lanes 0..2 of jx1..jz4 hold
 * ptr1..ptr3's values; lane 3 is zero/unspecified.
 */
#define GMX_MM_LOAD_3JCOORD_4ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_loadu_ps(ptr1+8); \
    jx4 = _mm_loadu_ps(ptr2+8); \
    jy4 = _mm_loadu_ps(ptr3+8); \
    jz4 = _mm_setzero_ps(); \
    _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
/* Load x,y,z of one atom from each of four arrays. Lanes 0..3 of
 * jx1/jy1/jz1 hold the values from ptr1..ptr4.
 */
#define GMX_MM_LOAD_4JCOORD_1ATOM_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    jx1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_load_ss(ptr2); \
    jy1 = _mm_load_ss(ptr3); \
    jz1 = _mm_load_ss(ptr4); \
    jx1 = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2+1)); \
    jy1 = _mm_loadh_pi(jy1,(__m64 *)(ptr3+1)); \
    jz1 = _mm_loadh_pi(jz1,(__m64 *)(ptr4+1)); \
    _tmp2 = _mm_unpacklo_ps(jx1,_tmp1); \
    _tmp3 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp4 = _mm_unpackhi_ps(jx1,_tmp1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    jx1 = _mm_movelh_ps(_tmp2,_tmp3); \
    jy1 = _mm_movelh_ps(_tmp4,_tmp5); \
    jz1 = _mm_movehl_ps(_tmp5,_tmp4); \
/* Load x,y,z of two atoms from each of four arrays. Lanes 0..3 of
 * jx1..jz2 hold ptr1..ptr4's values. Atom 1 uses a full 4x4 transpose.
 */
#define GMX_MM_LOAD_4JCOORD_2ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    jz2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4)); \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4+4)); \
    _tmp1 = _mm_unpacklo_ps(jy2,_tmp1); \
    _tmp2 = _mm_unpacklo_ps(jz2,_tmp2); \
    jy2 = _mm_unpacklo_ps(_tmp1,_tmp2); \
    jz2 = _mm_unpackhi_ps(_tmp1,_tmp2); \
/* Load x,y,z of three atoms from each of four arrays. Lanes 0..3 of
 * jx1..jz3 hold ptr1..ptr4's values; atoms 1-2 via 4x4 transposes,
 * atom 3's z gathered from four scalar loads.
 */
#define GMX_MM_LOAD_4JCOORD_3ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_loadu_ps(ptr4+4); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_load_ss(ptr1+8); \
    _tmp1 = _mm_load_ss(ptr2+8); \
    _tmp2 = _mm_load_ss(ptr3+8); \
    _tmp3 = _mm_load_ss(ptr4+8); \
    jz3 = _mm_unpacklo_ps(jz3,_tmp2); \
    _tmp1 = _mm_unpacklo_ps(_tmp1,_tmp3); \
    jz3 = _mm_unpacklo_ps(jz3,_tmp1); \
/* Load x,y,z of four atoms from each of four arrays via three full 4x4
 * transposes. Lanes 0..3 of jx1..jz4 hold ptr1..ptr4's values.
 */
#define GMX_MM_LOAD_4JCOORD_4ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    jx1 = _mm_loadu_ps(ptr1); \
    jy1 = _mm_loadu_ps(ptr2); \
    jz1 = _mm_loadu_ps(ptr3); \
    jx2 = _mm_loadu_ps(ptr4); \
    _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2); \
    jy2 = _mm_loadu_ps(ptr1+4); \
    jz2 = _mm_loadu_ps(ptr2+4); \
    jx3 = _mm_loadu_ps(ptr3+4); \
    jy3 = _mm_loadu_ps(ptr4+4); \
    _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3); \
    jz3 = _mm_loadu_ps(ptr1+8); \
    jx4 = _mm_loadu_ps(ptr2+8); \
    jy4 = _mm_loadu_ps(ptr3+8); \
    jz4 = _mm_loadu_ps(ptr4+8); \
    _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4); \
/* Add the force in lane 0 of jx1/jy1/jz1 to the xyz triplet at ptr1
 * (read-modify-write). Clobbers jx1/jy1.
 * NOTE(review): relies on a __m128 _tmp1 declaration inside the macro that
 * is not visible in this excerpt — confirm against the full file.
 */
#define GMX_MM_UPDATE_1JCOORD_1ATOM_PS(ptr1,jx1,jy1,jz1) { \
    jy1 = _mm_unpacklo_ps(jy1,jz1); \
    jx1 = _mm_movelh_ps(jx1,jy1); \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
/* Add the forces in lane 0 of jx1..jz2 to the two xyz triplets at ptr1
 * (6 floats, read-modify-write). Clobbers the j registers.
 */
#define GMX_MM_UPDATE_1JCOORD_2ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp2); \
/* Add the forces in lane 0 of jx1..jz3 to the three xyz triplets at ptr1
 * (9 floats, read-modify-write). Clobbers the j registers.
 */
#define GMX_MM_UPDATE_1JCOORD_3ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    jy2 = _mm_movelh_ps(jy2,jx3); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
/* Add the forces in lane 0 of jx1..jz4 to the four xyz triplets at ptr1
 * (12 floats, read-modify-write). Clobbers the j registers.
 */
#define GMX_MM_UPDATE_1JCOORD_4ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    jx1 = _mm_movelh_ps(jx1,jz1); \
    jy2 = _mm_movelh_ps(jy2,jx3); \
    jz3 = _mm_movelh_ps(jz3,jy4); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,jy2); \
    _tmp3 = _mm_add_ps(_tmp3,jz3); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
/* Add per-array forces to one xyz triplet at each of ptr1/ptr2: lane 0 of
 * jx1/jy1/jz1 goes to ptr1, lane 1 to ptr2 (read-modify-write).
 * Clobbers jx1.
 */
#define GMX_MM_UPDATE_2JCOORD_1ATOM_PS(ptr1,ptr2,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4; \
    _tmp1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2)); \
    _tmp2 = _mm_load_ss(ptr1+2); \
    _tmp3 = _mm_load_ss(ptr2+2); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp4 = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1)); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1); \
    _mm_store_ss(ptr1+2,_mm_add_ss(_tmp2,jz1)); \
    _mm_store_ss(ptr2+2,_mm_add_ss(_tmp3,_tmp4)); \
/* Add per-array forces to two xyz triplets at each of ptr1/ptr2: lane 0 of
 * jx1..jz2 goes to ptr1, lane 1 to ptr2 (read-modify-write).
 * Clobbers the j registers.
 */
#define GMX_MM_UPDATE_2JCOORD_2ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4)); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp4 = _mm_movelh_ps(jx1,jz1); \
    _tmp5 = _mm_movehl_ps(jz1,jx1); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp4); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp5); \
    _tmp3 = _mm_add_ps(_tmp3,jy2); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3); \
/* Add per-array forces to three xyz triplets at each of ptr1/ptr2: lane 0
 * of jx1..jz3 goes to ptr1, lane 1 to ptr2 (read-modify-write).
 * Clobbers the j registers.
 */
#define GMX_MM_UPDATE_2JCOORD_3ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_load_ss(ptr2+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp7 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp8 = _mm_movelh_ps(jx1,jz1); \
    _tmp9 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp7); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_store_ss(ptr2+8,_tmp6); \
/* Add per-array forces to four xyz triplets at each of ptr1/ptr2: lane 0
 * of jx1..jz4 goes to ptr1, lane 1 to ptr2 (read-modify-write).
 * Clobbers the j registers.
 * Fix: removed the unused temporary _tmp7 from the declaration list.
 */
#define GMX_MM_UPDATE_2JCOORD_4ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_loadu_ps(ptr2+8); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp8 = _mm_movelh_ps(jx1,jz1); \
    _tmp9 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(jy2,jx3); \
    _tmp11 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(jz3,jy4); \
    _tmp13 = _mm_movehl_ps(jy4,jz3); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp8); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp9); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp11); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp13); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_storeu_ps(ptr2+8,_tmp6); \
}
/* Add per-array forces to one xyz triplet at each of ptr1..ptr3:
 * lanes 0..2 of jx1/jy1/jz1 go to ptr1..ptr3 (read-modify-write).
 * Clobbers jx1.
 */
#define GMX_MM_UPDATE_3JCOORD_1ATOM_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
    _tmp4 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp5 = _mm_unpackhi_ps(jy1,jz1); \
    _tmp6 = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
    _tmp7 = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2)); \
    jx1 = _mm_movelh_ps(jx1,_tmp4); \
    _tmp7 = _mm_movelh_ps(_tmp7,_tmp5); \
    _tmp1 = _mm_add_ps(_tmp1,jx1); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp6); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp7); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
/* Add per-array forces to two xyz triplets at each of ptr1..ptr3:
 * lanes 0..2 of jx1..jz2 go to ptr1..ptr3 (read-modify-write).
 * Clobbers the j registers.
 * Fix: _tmp4 and _tmp5 were passed to _mm_loadl_pi while still
 * uninitialized (undefined upper lanes / UB on read); seed them with
 * _mm_setzero_ps() as every sibling macro in this file does. Only the
 * loaded lanes are stored back, so results are unchanged.
 */
#define GMX_MM_UPDATE_3JCOORD_2ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadu_ps(ptr3); \
    _tmp4 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr2+4)); \
    _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
    _tmp6 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp7 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp8 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp9 = _mm_movelh_ps(jx1,jz1); \
    _tmp10 = _mm_movehl_ps(jz1,jx1); \
    _tmp6 = _mm_movelh_ps(_tmp6,_tmp7); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp9); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp6); \
    _tmp4 = _mm_add_ps(_tmp4,jy2); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp8); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp4); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp4); \
    _mm_storel_pi((__m64 *)(ptr3+4),_tmp5); \
}
/* Add per-array forces to three xyz triplets at each of ptr1..ptr3:
 * lanes 0..2 of jx1..jz3 go to ptr1..ptr3 (read-modify-write).
 * Clobbers the j registers.
 */
#define GMX_MM_UPDATE_3JCOORD_3ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_load_ss(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_load_ss(ptr3+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp15 = _mm_movehl_ps(jz3,jz3); \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp14); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9 = _mm_add_ss(_tmp9,_tmp15); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_store_ss(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_store_ss(ptr3+8,_tmp9); \
/* Add per-array forces to four xyz triplets at each of ptr1..ptr3:
 * lanes 0..2 of jx1..jz4 go to ptr1..ptr3 (read-modify-write).
 * Clobbers the j registers.
 */
#define GMX_MM_UPDATE_3JCOORD_4ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_loadu_ps(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_loadu_ps(ptr3+8); \
    _tmp10 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14 = _mm_unpackhi_ps(jz3,jx4); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    _tmp15 = _mm_unpackhi_ps(jy4,jz4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp16 = _mm_movelh_ps(jx1,jz1); \
    _tmp17 = _mm_movehl_ps(jz1,jx1); \
    _tmp10 = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18 = _mm_movelh_ps(jy2,jx3); \
    _tmp19 = _mm_movehl_ps(jx3,jy2); \
    _tmp12 = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp20 = _mm_movelh_ps(jz3,jy4); \
    _tmp21 = _mm_movehl_ps(jy4,jz3); \
    _tmp14 = _mm_movelh_ps(_tmp14,_tmp15); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp20); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp21); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9 = _mm_add_ps(_tmp9,_tmp14); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_storeu_ps(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_storeu_ps(ptr3+8,_tmp9); \
/* Add per-array forces to one xyz triplet at each of ptr1..ptr4:
 * lanes 0..3 of jx1/jy1/jz1 go to ptr1..ptr4 (read-modify-write).
 */
#define GMX_MM_UPDATE_4JCOORD_1ATOM_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    _tmp1 = _mm_load_ss(ptr1); \
    _tmp1 = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1)); \
    _tmp2 = _mm_load_ss(ptr2); \
    _tmp2 = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1)); \
    _tmp3 = _mm_load_ss(ptr3); \
    _tmp3 = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1)); \
    _tmp4 = _mm_load_ss(ptr4); \
    _tmp4 = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1)); \
    _tmp5 = _mm_unpacklo_ps(jy1,jz1); \
    _tmp6 = _mm_unpackhi_ps(jy1,jz1); \
    _tmp7 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
    _tmp8 = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
    _tmp9 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
    _tmp10 = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp7); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp8); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp9); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp10); \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3); \
    _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4); \
/* Add per-array forces to two xyz triplets at each of ptr1..ptr4:
 * lanes 0..3 of jx1..jz2 go to ptr1..ptr4 (read-modify-write).
 * Clobbers the j registers.
 */
#define GMX_MM_UPDATE_4JCOORD_2ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr2); \
    _tmp3 = _mm_loadu_ps(ptr3); \
    _tmp4 = _mm_loadu_ps(ptr4); \
    _tmp5 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
    _tmp5 = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4)); \
    _tmp6 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4)); \
    _tmp6 = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4)); \
    _tmp7 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp8 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp9 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp10 = _mm_movelh_ps(jx1,jz1); \
    _tmp11 = _mm_movehl_ps(jz1,jx1); \
    _tmp12 = _mm_movelh_ps(_tmp7,_tmp8); \
    _tmp13 = _mm_movehl_ps(_tmp8,_tmp7); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp10); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp11); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp13); \
    _tmp5 = _mm_add_ps(_tmp5,jy2); \
    _tmp6 = _mm_add_ps(_tmp6,_tmp9); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storeu_ps(ptr4,_tmp4); \
    _mm_storel_pi((__m64 *)(ptr1+4),_tmp5); \
    _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5); \
    _mm_storel_pi((__m64 *)(ptr3+4),_tmp6); \
    _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6); \
/* Add per-array forces to three xyz triplets at each of ptr1..ptr4:
 * lanes 0..3 of jx1..jz3 go to ptr1..ptr4 (read-modify-write).
 * Clobbers the j registers.
 */
#define GMX_MM_UPDATE_4JCOORD_3ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_load_ss(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_load_ss(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_load_ss(ptr3+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps(ptr4+4); \
    _tmp12 = _mm_load_ss(ptr4+8); \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp18 = _mm_movehl_ps(jz3,jz3); \
    _tmp19 = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
    _tmp20 = _mm_movelh_ps(jx1,jz1); \
    _tmp21 = _mm_movehl_ps(jz1,jx1); \
    _tmp22 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp23 = _mm_movelh_ps(jy2,jx3); \
    _tmp24 = _mm_movehl_ps(jx3,jy2); \
    _tmp25 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp20); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp23); \
    _tmp3 = _mm_add_ss(_tmp3,jz3); \
    _tmp4 = _mm_add_ps(_tmp4,_tmp21); \
    _tmp5 = _mm_add_ps(_tmp5,_tmp24); \
    _tmp6 = _mm_add_ss(_tmp6,_tmp17); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp22); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp25); \
    _tmp9 = _mm_add_ss(_tmp9,_tmp18); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ss(_tmp12,_tmp19); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_store_ss(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_store_ss(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_store_ss(ptr3+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps(ptr4+4,_tmp11); \
    _mm_store_ss(ptr4+8,_tmp12); \
/* Add per-array forces to four xyz triplets at each of ptr1..ptr4:
 * lanes 0..3 of jx1..jz4 go to ptr1..ptr4 (read-modify-write).
 * Clobbers the j registers.
 */
#define GMX_MM_UPDATE_4JCOORD_4ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22;\
    __m128 _tmp23,_tmp24; \
    _tmp1 = _mm_loadu_ps(ptr1); \
    _tmp2 = _mm_loadu_ps(ptr1+4); \
    _tmp3 = _mm_loadu_ps(ptr1+8); \
    _tmp4 = _mm_loadu_ps(ptr2); \
    _tmp5 = _mm_loadu_ps(ptr2+4); \
    _tmp6 = _mm_loadu_ps(ptr2+8); \
    _tmp7 = _mm_loadu_ps(ptr3); \
    _tmp8 = _mm_loadu_ps(ptr3+4); \
    _tmp9 = _mm_loadu_ps(ptr3+8); \
    _tmp10 = _mm_loadu_ps(ptr4); \
    _tmp11 = _mm_loadu_ps(ptr4+4); \
    _tmp12 = _mm_loadu_ps(ptr4+8); \
    _tmp13 = _mm_unpackhi_ps(jx1,jy1); \
    jx1 = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14 = _mm_unpackhi_ps(jz1,jx2); \
    jz1 = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15 = _mm_unpackhi_ps(jy2,jz2); \
    jy2 = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16 = _mm_unpackhi_ps(jx3,jy3); \
    jx3 = _mm_unpacklo_ps(jx3,jy3); \
    _tmp17 = _mm_unpackhi_ps(jz3,jx4); \
    jz3 = _mm_unpacklo_ps(jz3,jx4); \
    _tmp18 = _mm_unpackhi_ps(jy4,jz4); \
    jy4 = _mm_unpacklo_ps(jy4,jz4); \
    _tmp19 = _mm_movelh_ps(jx1,jz1); \
    jz1 = _mm_movehl_ps(jz1,jx1); \
    _tmp20 = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14 = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp21 = _mm_movelh_ps(jy2,jx3); \
    jx3 = _mm_movehl_ps(jx3,jy2); \
    _tmp22 = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16 = _mm_movehl_ps(_tmp16,_tmp15); \
    _tmp23 = _mm_movelh_ps(jz3,jy4); \
    jy4 = _mm_movehl_ps(jy4,jz3); \
    _tmp24 = _mm_movelh_ps(_tmp17,_tmp18); \
    _tmp18 = _mm_movehl_ps(_tmp18,_tmp17); \
    _tmp1 = _mm_add_ps(_tmp1,_tmp19); \
    _tmp2 = _mm_add_ps(_tmp2,_tmp21); \
    _tmp3 = _mm_add_ps(_tmp3,_tmp23); \
    _tmp4 = _mm_add_ps(_tmp4,jz1); \
    _tmp5 = _mm_add_ps(_tmp5,jx3); \
    _tmp6 = _mm_add_ps(_tmp6,jy4); \
    _tmp7 = _mm_add_ps(_tmp7,_tmp20); \
    _tmp8 = _mm_add_ps(_tmp8,_tmp22); \
    _tmp9 = _mm_add_ps(_tmp9,_tmp24); \
    _tmp10 = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11 = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12 = _mm_add_ps(_tmp12,_tmp18); \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr1+4,_tmp2); \
    _mm_storeu_ps(ptr1+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps(ptr2+4,_tmp5); \
    _mm_storeu_ps(ptr2+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps(ptr3+4,_tmp8); \
    _mm_storeu_ps(ptr3+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps(ptr4+4,_tmp11); \
    _mm_storeu_ps(ptr4+8,_tmp12); \
/* Per-lane x*x + y*y + z*z (squared length when x,y,z are the components
 * of a distance vector).
 */
gmx_mm_scalarprod_ps(__m128 x, __m128 y, __m128 z)
    return _mm_add_ps(_mm_add_ps(_mm_mul_ps(x,x),_mm_mul_ps(y,y)),_mm_mul_ps(z,z));
/* Per-lane 1/x: hardware rcpps estimate refined with one Newton-Raphson
 * iteration, lu*(2 - x*lu).
 */
gmx_mm_recip_ps(__m128 x)
    const __m128 two = {2.0,2.0,2.0,2.0};
    __m128 lu = _mm_rcp_ps(x);
    return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(x,lu)));
/* Per-lane 1/sqrt(x): hardware rsqrtps estimate refined with one
 * Newton-Raphson iteration, 0.5*lu*(3 - lu*lu*x).
 */
gmx_mm_invsqrt_ps(__m128 x)
    const __m128 half = {0.5,0.5,0.5,0.5};
    const __m128 three = {3.0,3.0,3.0,3.0};
    __m128 lu = _mm_rsqrt_ps(x);
    return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
/* Routine to be called with rswitch/rcut at the beginning of a kernel
 * to set up the 7 constants used for analytic 5th order switch calculations.
 * Writes the switch-polynomial coefficients (C3..C5) and the derivative
 * coefficients (D2..D4) through the output pointers; all are functions of
 * powers of 1/(rcut - rswitch).
 */
gmx_mm_setup_switch5_constants_ps(__m128 rswitch, __m128 rcut,
    __m128 *switch_C3, __m128 *switch_C4, __m128 *switch_C5,
    __m128 *switch_D2, __m128 *switch_D3, __m128 *switch_D4)
    const __m128 cm6 = { -6.0, -6.0, -6.0, -6.0};
    const __m128 cm10 = {-10.0,-10.0,-10.0,-10.0};
    const __m128 c15 = { 15.0, 15.0, 15.0, 15.0};
    const __m128 cm30 = {-30.0,-30.0,-30.0,-30.0};
    const __m128 c60 = { 60.0, 60.0, 60.0, 60.0};
    __m128 d,dinv,dinv2,dinv3,dinv4,dinv5;
    d = _mm_sub_ps(rcut,rswitch);
    /* 1/d via the N-R-refined approximate reciprocal above */
    dinv = gmx_mm_recip_ps(d);
    dinv2 = _mm_mul_ps(dinv,dinv);
    dinv3 = _mm_mul_ps(dinv2,dinv);
    dinv4 = _mm_mul_ps(dinv2,dinv2);
    dinv5 = _mm_mul_ps(dinv3,dinv2);
    *switch_C3 = _mm_mul_ps(cm10,dinv3);
    *switch_C4 = _mm_mul_ps(c15,dinv4);
    *switch_C5 = _mm_mul_ps(cm6,dinv5);
    *switch_D2 = _mm_mul_ps(cm30,dinv3);
    *switch_D3 = _mm_mul_ps(c60,dinv4);
    *switch_D4 = _mm_mul_ps(cm30,dinv5);
/* Evaluate the 5th-order switch sw and its derivative dsw at distance r,
 * using the coefficients from gmx_mm_setup_switch5_constants_ps().  r is
 * clamped to [rswitch,rcut], so sw==1 below the switch distance and sw==0
 * at the cutoff.  The temporaries d,d2 are declared locally here (the body
 * referenced them without a visible declaration), and the block is properly
 * terminated.
 */
#define GMX_MM_SET_SWITCH5_PS(r,rswitch,rcut,sw,dsw,sw_C3,sw_C4,sw_C5,sw_D2,sw_D3,sw_D4) { \
    const __m128 _sw_one = { 1.0, 1.0, 1.0, 1.0}; \
    __m128 d,d2; \
    d    = _mm_max_ps(r,rswitch); \
    d    = _mm_min_ps(d,rcut); \
    d    = _mm_sub_ps(d,rswitch); \
    d2   = _mm_mul_ps(d,d); \
    sw   = _mm_mul_ps(d,sw_C5); \
    dsw  = _mm_mul_ps(d,sw_D4); \
    sw   = _mm_add_ps(sw,sw_C4); \
    dsw  = _mm_add_ps(dsw,sw_D3); \
    sw   = _mm_mul_ps(sw,d); \
    dsw  = _mm_mul_ps(dsw,d); \
    sw   = _mm_add_ps(sw,sw_C3); \
    dsw  = _mm_add_ps(dsw,sw_D2); \
    sw   = _mm_mul_ps(sw,_mm_mul_ps(d,d2)); \
    dsw  = _mm_mul_ps(dsw,d2); \
    sw   = _mm_add_ps(sw,_sw_one); \
}
/* Returns fscaltmp, multiply with rinvsq to get fscal! */
/* Coulomb interaction for four pairs at once: vcoul = qq*(1/r) is added to
 * *vctot and also returned as fscaltmp (fscal = qq*rinv*rinvsq).
 */
static inline __m128
gmx_mm_int_coulomb_ps(__m128 rinv, __m128 qq,__m128 *vctot)
{
    __m128 vcoul;

    vcoul  = _mm_mul_ps(qq,rinv);      /* qq/r */
    *vctot = _mm_add_ps(*vctot,vcoul);

    return vcoul;
}
/* Energy-only Coulomb for four pairs: accumulate vcoul = qq*(1/r) into *vctot.
 * NOTE(review): the storage/return-type line and the function tail are not
 * visible in this extraction of the file - confirm the return value against
 * the full header.
 */
949 gmx_mm_int_coulomb_noforce_ps(__m128 rinv, __m128 qq,__m128 *vctot)
951 __m128 vcoul = _mm_mul_ps(qq,rinv);
952 *vctot = _mm_add_ps(*vctot,vcoul);
/* Returns fscaltmp, multiply with rinvsq to get fscal! */
/* Coulomb with reaction field: vcoul = qq*(1/r + krf*r^2 - crf) is added to
 * *vctot; the returned fscaltmp is qq*(1/r - 2*krf*r^2).
 */
static inline __m128
gmx_mm_int_coulombrf_ps(const __m128 rinv, const __m128 rsq, const __m128 krf, const __m128 crf, const __m128 qq,__m128 *vctot)
{
    const __m128 two = {2.0,2.0,2.0,2.0};

    __m128 krsq    = _mm_mul_ps(krf,rsq);
    __m128 shifted = _mm_sub_ps(_mm_add_ps(rinv,krsq),crf);
    __m128 vcoul   = _mm_mul_ps(qq,shifted);

    *vctot = _mm_add_ps(*vctot,vcoul);

    __m128 ftmp = _mm_sub_ps(rinv, _mm_mul_ps(two,krsq));
    return _mm_mul_ps(qq,ftmp);
}
/* Energy-only reaction-field Coulomb: vcoul = qq*(1/r + krf*r^2 - crf),
 * accumulated into *vctot.  No force term is computed in the visible body.
 * NOTE(review): the declarations of krsq/vcoul and the function tail are not
 * visible in this extraction - confirm against the full file.
 */
972 gmx_mm_int_coulombrf_noforce_ps(__m128 rinv, __m128 rsq, __m128 krf, __m128 crf, __m128 qq,__m128 *vctot)
976 krsq = _mm_mul_ps(krf,rsq);
977 vcoul = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv,krsq),crf));
978 *vctot = _mm_add_ps(*vctot,vcoul);
/* Returns fscaltmp, multiply with rinvsq to get fscal! */
/* Lennard-Jones: accumulates c12/r^12 - c6/r^6 into *vvdwtot and returns
 * fscaltmp = 12*vvdw12 - 6*vvdw6 (rinvsix = rinvsq^3).
 */
static inline __m128
gmx_mm_int_lj_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    const __m128 six    = {6.0,6.0,6.0,6.0};
    const __m128 twelve = {12.0,12.0,12.0,12.0};

    __m128 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
    __m128 vvdw6   = _mm_mul_ps(c6,rinvsix);
    __m128 vvdw12  = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));

    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_sub_ps(vvdw12,vvdw6));

    return _mm_sub_ps(_mm_mul_ps(twelve,vvdw12), _mm_mul_ps(six,vvdw6));
}
/* Energy-only Lennard-Jones: accumulates c12/r^12 - c6/r^6 into *vvdwtot
 * (rinvsix = rinvsq^3); no force factor is computed in the visible body.
 * NOTE(review): the storage/return-type line and the function tail are not
 * visible in this extraction - confirm against the full file.
 */
1010 gmx_mm_int_lj_potonly_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
1012 __m128 rinvsix,vvdw6,vvdw12;
1014 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
1015 vvdw6 = _mm_mul_ps(c6,rinvsix);
1016 vvdw12 = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
1017 *vvdwtot = _mm_add_ps(*vvdwtot , _mm_sub_ps(vvdw12,vvdw6));
/* SSE2 substitute for the SSE4.1 _mm_extract_epi32: shift lane 'imm' (must be
 * a compile-time constant 0-3) down to lane 0 and move it to an integer
 * register.
 */
1022 #define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
1025 /* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Coulomb for four interactions.  VFtab holds cubic-spline quartets
 * (Y,F,G,H) of 4 floats per table point; the lookup index is the integer part
 * of r*tabscale and eps the fractional part.  Accumulates qq*(Y + eps*Fp)
 * into *vctot and returns the spline force derivative scaled by qq*tabscale.
 * NOTE(review): the __m128i n0 declaration and the function braces fall on
 * lines not visible in this extraction.
 */
1026 static inline __m128
1027 gmx_mm_int_4_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
1029 __m128 rt,eps,eps2,Y,F,G,H,vcoul;
1031 int n_a,n_b,n_c,n_d;
/* split r*tabscale into integer table index and fractional eps */
1033 rt = _mm_mul_ps(r,tabscale);
1034 n0 = _mm_cvttps_epi32(rt);
1035 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1036 eps2 = _mm_mul_ps(eps,eps);
1038 /* Extract indices from n0 */
1039 n_a = gmx_mm_extract_epi32(n0,0);
1040 n_b = gmx_mm_extract_epi32(n0,1);
1041 n_c = gmx_mm_extract_epi32(n0,2);
1042 n_d = gmx_mm_extract_epi32(n0,3);
/* gather one (Y,F,G,H) quartet per interaction, then transpose so each
 * register holds one coefficient for all four interactions */
1043 Y = _mm_load_ps(VFtab + 4* n_a);
1044 F = _mm_load_ps(VFtab + 4* n_b);
1045 G = _mm_load_ps(VFtab + 4* n_c);
1046 H = _mm_load_ps(VFtab + 4* n_d);
1047 _MM_TRANSPOSE4_PS(Y,F,G,H);
1048 H = _mm_mul_ps(H,eps2); /* Heps2 */
1049 G = _mm_mul_ps(G,eps); /* Geps */
1050 F = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp */
1051 vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
1052 *vctot = _mm_add_ps(*vctot,vcoul);
/* FF = Fp + Geps + 2*Heps2 */
1054 F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
1056 return _mm_mul_ps(F,tabscale);
1061 /* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Lennard-Jones for four interactions.  Each table record is
 * 4*(offset+2) floats: an optional Coulomb quartet (offset=1) followed by the
 * dispersion and repulsion cubic-spline quartets (Y,F,G,H).  Accumulates
 * c6*Vdisp + c12*Vrep into *vvdwtot and returns the combined spline force
 * derivative scaled by tabscale.
 * NOTE(review): the __m128i n0 declaration falls on a line not visible in
 * this extraction.
 */
1062 static inline __m128
1063 gmx_mm_int_4_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
1065 __m128 rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
1067 int n_a,n_b,n_c,n_d;
1069 rt = _mm_mul_ps(r,tabscale);
1070 n0 = _mm_cvttps_epi32(rt);
1071 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1072 eps2 = _mm_mul_ps(eps,eps);
1074 /* Extract indices from n0 */
1075 n_a = gmx_mm_extract_epi32(n0,0);
1076 n_b = gmx_mm_extract_epi32(n0,1);
1077 n_c = gmx_mm_extract_epi32(n0,2);
1078 n_d = gmx_mm_extract_epi32(n0,3);
1080 /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
1081 * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
1082 * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
 */
1084 Yd = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset);
1085 Fd = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset);
1086 Gd = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset);
1087 Hd = _mm_load_ps(VFtab + 4*(offset+2)* n_d + 4*offset);
1088 Yr = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset + 4);
1089 Fr = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset + 4);
1090 Gr = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset + 4);
1091 Hr = _mm_load_ps(VFtab + 4*(offset+2)* n_d + 4*offset + 4);
1092 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
1093 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
1094 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
1095 Gd = _mm_mul_ps(Gd,eps); /* Geps */
1096 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
1097 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
1098 Gr = _mm_mul_ps(Gr,eps); /* Geps */
1099 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
1100 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
1101 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
1102 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
1104 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
1105 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
1107 return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
1111 /* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Coulomb + Lennard-Jones for four interactions.  Table records are
 * 12 floats: Coulomb, dispersion and repulsion spline quartets (Y,F,G,H).
 * Accumulates qq*Vc into *vctot, c6*Vd + c12*Vr into *vvdwtot, and returns
 * the combined spline force derivative scaled by tabscale.
 * NOTE(review): the __m128i n0 declaration falls on a line not visible in
 * this extraction.
 */
1112 static inline __m128
1113 gmx_mm_int_4_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
1114 __m128 *vctot, __m128 *vvdwtot)
1116 __m128 rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
1118 int n_a,n_b,n_c,n_d;
1120 rt = _mm_mul_ps(r,tabscale);
1121 n0 = _mm_cvttps_epi32(rt);
1122 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1123 eps2 = _mm_mul_ps(eps,eps);
1125 /* Extract indices from n0 */
1126 n_a = gmx_mm_extract_epi32(n0,0);
1127 n_b = gmx_mm_extract_epi32(n0,1);
1128 n_c = gmx_mm_extract_epi32(n0,2);
1129 n_d = gmx_mm_extract_epi32(n0,3);
/* record layout: +0 coulomb, +4 dispersion, +8 repulsion */
1132 Yc = _mm_load_ps(VFtab + 12* n_a);
1133 Fc = _mm_load_ps(VFtab + 12* n_b);
1134 Gc = _mm_load_ps(VFtab + 12* n_c);
1135 Hc = _mm_load_ps(VFtab + 12* n_d);
1136 Yd = _mm_load_ps(VFtab + 12* n_a + 4);
1137 Fd = _mm_load_ps(VFtab + 12* n_b + 4);
1138 Gd = _mm_load_ps(VFtab + 12* n_c + 4);
1139 Hd = _mm_load_ps(VFtab + 12* n_d + 4);
1140 Yr = _mm_load_ps(VFtab + 12* n_a + 8);
1141 Fr = _mm_load_ps(VFtab + 12* n_b + 8);
1142 Gr = _mm_load_ps(VFtab + 12* n_c + 8);
1143 Hr = _mm_load_ps(VFtab + 12* n_d + 8);
1144 _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
1145 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
1146 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
1147 Hc = _mm_mul_ps(Hc,eps2); /* Heps2 */
1148 Gc = _mm_mul_ps(Gc,eps); /* Geps */
1149 Fc = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp */
1150 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
1151 Gd = _mm_mul_ps(Gd,eps); /* Geps */
1152 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
1153 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
1154 Gr = _mm_mul_ps(Gr,eps); /* Geps */
1155 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
1157 vcoul = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
1158 *vctot = _mm_add_ps(*vctot,vcoul);
1160 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
1161 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
1162 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
1164 Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
1165 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
1166 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
1168 return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
1173 /* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Coulomb for three interactions; the fourth SSE lane is fed a
 * zeroed table row (H = 0 before the transpose) and thus adds exactly zero
 * to the accumulators.
 * NOTE(review): declarations of n0 (__m128i) and n_a/n_b/n_c fall on lines
 * not visible in this extraction.
 */
1174 static inline __m128
1175 gmx_mm_int_3_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
1177 __m128 rt,eps,eps2,Y,F,G,H,vcoul;
1181 rt = _mm_mul_ps(r,tabscale);
1182 n0 = _mm_cvttps_epi32(rt);
1183 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1184 eps2 = _mm_mul_ps(eps,eps);
1186 /* Extract indices from n0 */
1187 n_a = gmx_mm_extract_epi32(n0,0);
1188 n_b = gmx_mm_extract_epi32(n0,1);
1189 n_c = gmx_mm_extract_epi32(n0,2);
1190 Y = _mm_load_ps(VFtab + 4* n_a);
1191 F = _mm_load_ps(VFtab + 4* n_b);
1192 G = _mm_load_ps(VFtab + 4* n_c);
1193 H = _mm_setzero_ps();
1194 _MM_TRANSPOSE4_PS(Y,F,G,H);
1195 H = _mm_mul_ps(H,eps2); /* Heps2 */
1196 G = _mm_mul_ps(G,eps); /* Geps */
1197 F = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp */
1198 vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
1199 *vctot = _mm_add_ps(*vctot,vcoul);
1201 F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
1203 return _mm_mul_ps(F,tabscale);
/* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Lennard-Jones for three interactions (lane 3 is padded with
 * zeroed table rows and contributes zero).  Accumulates c6*Vdisp + c12*Vrep
 * into *vvdwtot and returns the combined spline derivative times tabscale.
 */
static inline __m128
gmx_mm_int_3_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c;

    rt    = _mm_mul_ps(r,tabscale);
    n0    = _mm_cvttps_epi32(rt);
    eps   = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2  = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a   = gmx_mm_extract_epi32(n0,0);
    n_b   = gmx_mm_extract_epi32(n0,1);
    n_c   = gmx_mm_extract_epi32(n0,2);

    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     * Each record is 4*(offset+2) floats, so the dispersion quartet starts
     * 4*offset floats in.  Bugfix: this previously added only 'offset', which
     * for mixed tables (offset=1) addressed the wrong elements and broke the
     * 16-byte alignment required by _mm_load_ps; the four-interaction variant
     * already used 4*offset.
     */
    Yd    = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset);
    Fd    = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset);
    Gd    = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset);
    Hd    = _mm_setzero_ps();
    Yr    = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset + 4);
    Fr    = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset + 4);
    Gr    = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset + 4);
    Hr    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd    = _mm_mul_ps(Hd,eps2);                  /* Heps2 */
    Gd    = _mm_mul_ps(Gd,eps);                   /* Geps  */
    Fd    = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));    /* Fp    */
    Hr    = _mm_mul_ps(Hr,eps2);                  /* Heps2 */
    Gr    = _mm_mul_ps(Gr,eps);                   /* Geps  */
    Fr    = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));    /* Fp    */
    vvdw6  = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd    = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr    = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr), tabscale);
}
1257 /* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Coulomb + Lennard-Jones for three interactions; table records are
 * 12 floats (Coulomb, dispersion, repulsion quartets).  Lane 3 is padded with
 * zeroed rows and adds exactly zero.  Accumulates qq*Vc into *vctot and
 * c6*Vd + c12*Vr into *vvdwtot; returns the combined derivative * tabscale.
 * NOTE(review): declarations of n0 (__m128i) and n_a/n_b/n_c fall on lines
 * not visible in this extraction.
 */
1258 static inline __m128
1259 gmx_mm_int_3_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
1260 __m128 *vctot, __m128 *vvdwtot)
1262 __m128 rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
1266 rt = _mm_mul_ps(r,tabscale);
1267 n0 = _mm_cvttps_epi32(rt);
1268 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1269 eps2 = _mm_mul_ps(eps,eps);
1271 /* Extract indices from n0 */
1272 n_a = gmx_mm_extract_epi32(n0,0);
1273 n_b = gmx_mm_extract_epi32(n0,1);
1274 n_c = gmx_mm_extract_epi32(n0,2);
1277 Yc = _mm_load_ps(VFtab + 12* n_a);
1278 Fc = _mm_load_ps(VFtab + 12* n_b);
1279 Gc = _mm_load_ps(VFtab + 12* n_c);
1280 Hc = _mm_setzero_ps();
1281 Yd = _mm_load_ps(VFtab + 12* n_a + 4);
1282 Fd = _mm_load_ps(VFtab + 12* n_b + 4);
1283 Gd = _mm_load_ps(VFtab + 12* n_c + 4);
1284 Hd = _mm_setzero_ps();
1285 Yr = _mm_load_ps(VFtab + 12* n_a + 8);
1286 Fr = _mm_load_ps(VFtab + 12* n_b + 8);
1287 Gr = _mm_load_ps(VFtab + 12* n_c + 8);
1288 Hr = _mm_setzero_ps();
1289 _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
1290 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
1291 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
1292 Hc = _mm_mul_ps(Hc,eps2); /* Heps2 */
1293 Gc = _mm_mul_ps(Gc,eps); /* Geps */
1294 Fc = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp */
1295 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
1296 Gd = _mm_mul_ps(Gd,eps); /* Geps */
1297 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
1298 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
1299 Gr = _mm_mul_ps(Gr,eps); /* Geps */
1300 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
1302 vcoul = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
1303 *vctot = _mm_add_ps(*vctot,vcoul);
1305 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
1306 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
1307 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
1309 Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
1310 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
1311 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
1313 return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
1320 /* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Coulomb for two interactions; lanes 2 and 3 are processed with
 * zeroed table rows and add exactly zero to the accumulators.
 * NOTE(review): declarations of n0 (__m128i) and n_a/n_b fall on lines not
 * visible in this extraction.
 */
1321 static inline __m128
1322 gmx_mm_int_2_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
1324 __m128 rt,eps,eps2,Y,F,G,H,vcoul;
1328 rt = _mm_mul_ps(r,tabscale);
1329 n0 = _mm_cvttps_epi32(rt);
1330 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1331 eps2 = _mm_mul_ps(eps,eps);
1333 /* Extract indices from n0 */
1334 n_a = gmx_mm_extract_epi32(n0,0);
1335 n_b = gmx_mm_extract_epi32(n0,1);
1336 Y = _mm_load_ps(VFtab + 4* n_a);
1337 F = _mm_load_ps(VFtab + 4* n_b);
1338 G = _mm_setzero_ps();
1339 H = _mm_setzero_ps();
1340 _MM_TRANSPOSE4_PS(Y,F,G,H);
1341 H = _mm_mul_ps(H,eps2); /* Heps2 */
1342 G = _mm_mul_ps(G,eps); /* Geps */
1343 F = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp */
1344 vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
1345 *vctot = _mm_add_ps(*vctot,vcoul);
1347 F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
1349 return _mm_mul_ps(F,tabscale);
/* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Lennard-Jones for two interactions (lanes 2,3 are padded with
 * zeroed table rows and contribute zero).  Accumulates c6*Vdisp + c12*Vrep
 * into *vvdwtot and returns the combined spline derivative times tabscale.
 */
static inline __m128
gmx_mm_int_2_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b;

    rt    = _mm_mul_ps(r,tabscale);
    n0    = _mm_cvttps_epi32(rt);
    eps   = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2  = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a   = gmx_mm_extract_epi32(n0,0);
    n_b   = gmx_mm_extract_epi32(n0,1);

    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     * Each record is 4*(offset+2) floats, so the dispersion quartet starts
     * 4*offset floats in.  Bugfix: this previously added only 'offset', which
     * for mixed tables (offset=1) addressed the wrong elements and broke the
     * 16-byte alignment required by _mm_load_ps; the four-interaction variant
     * already used 4*offset.
     */
    Yd    = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset);
    Fd    = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset);
    Gd    = _mm_setzero_ps();
    Hd    = _mm_setzero_ps();
    Yr    = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset + 4);
    Fr    = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset + 4);
    Gr    = _mm_setzero_ps();
    Hr    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd    = _mm_mul_ps(Hd,eps2);                  /* Heps2 */
    Gd    = _mm_mul_ps(Gd,eps);                   /* Geps  */
    Fd    = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));    /* Fp    */
    Hr    = _mm_mul_ps(Hr,eps2);                  /* Heps2 */
    Gr    = _mm_mul_ps(Gr,eps);                   /* Geps  */
    Fr    = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));    /* Fp    */
    vvdw6  = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd    = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr    = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr), tabscale);
}
1402 /* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Coulomb + Lennard-Jones for two interactions; table records are
 * 12 floats (Coulomb, dispersion, repulsion quartets).  Lanes 2 and 3 are
 * padded with zeroed rows and add exactly zero.
 * NOTE(review): declarations of n0 (__m128i) and n_a/n_b fall on lines not
 * visible in this extraction.
 */
1403 static inline __m128
1404 gmx_mm_int_2_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
1405 __m128 *vctot, __m128 *vvdwtot)
1407 __m128 rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
1411 rt = _mm_mul_ps(r,tabscale);
1412 n0 = _mm_cvttps_epi32(rt);
1413 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1414 eps2 = _mm_mul_ps(eps,eps);
1416 /* Extract indices from n0 */
1417 n_a = gmx_mm_extract_epi32(n0,0);
1418 n_b = gmx_mm_extract_epi32(n0,1);
1420 Yc = _mm_load_ps(VFtab + 12* n_a);
1421 Fc = _mm_load_ps(VFtab + 12* n_b);
1422 Gc = _mm_setzero_ps();
1423 Hc = _mm_setzero_ps();
1424 Yd = _mm_load_ps(VFtab + 12* n_a + 4);
1425 Fd = _mm_load_ps(VFtab + 12* n_b + 4);
1426 Gd = _mm_setzero_ps();
1427 Hd = _mm_setzero_ps();
1428 Yr = _mm_load_ps(VFtab + 12* n_a + 8);
1429 Fr = _mm_load_ps(VFtab + 12* n_b + 8);
1430 Gr = _mm_setzero_ps();
1431 Hr = _mm_setzero_ps();
1432 _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
1433 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
1434 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
1435 Hc = _mm_mul_ps(Hc,eps2); /* Heps2 */
1436 Gc = _mm_mul_ps(Gc,eps); /* Geps */
1437 Fc = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp */
1438 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
1439 Gd = _mm_mul_ps(Gd,eps); /* Geps */
1440 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
1441 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
1442 Gr = _mm_mul_ps(Gr,eps); /* Geps */
1443 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
1445 vcoul = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
1446 *vctot = _mm_add_ps(*vctot,vcoul);
1448 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
1449 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
1450 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
1452 Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
1453 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
1454 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
1456 return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
1462 /* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Coulomb for a single interaction; lanes 1-3 are processed with
 * zeroed table rows and add exactly zero to the accumulators.
 * NOTE(review): declarations of n0 (__m128i) and n_a fall on lines not
 * visible in this extraction.
 */
1463 static inline __m128
1464 gmx_mm_int_1_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
1466 __m128 rt,eps,eps2,Y,F,G,H,vcoul;
1470 rt = _mm_mul_ps(r,tabscale);
1471 n0 = _mm_cvttps_epi32(rt);
1472 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1473 eps2 = _mm_mul_ps(eps,eps);
1475 /* Extract indices from n0 */
1476 n_a = gmx_mm_extract_epi32(n0,0);
1477 Y = _mm_load_ps(VFtab + 4* n_a);
1478 F = _mm_setzero_ps();
1479 G = _mm_setzero_ps();
1480 H = _mm_setzero_ps();
1481 _MM_TRANSPOSE4_PS(Y,F,G,H);
1482 H = _mm_mul_ps(H,eps2); /* Heps2 */
1483 G = _mm_mul_ps(G,eps); /* Geps */
1484 F = _mm_add_ps(F, _mm_add_ps(G,H)); /* Fp */
1485 vcoul = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
1486 *vctot = _mm_add_ps(*vctot,vcoul);
1488 F = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
1490 return _mm_mul_ps(F,tabscale);
/* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Lennard-Jones for a single interaction (lanes 1-3 are padded
 * with zeroed table rows and contribute zero).  Accumulates
 * c6*Vdisp + c12*Vrep into *vvdwtot and returns the combined spline
 * derivative times tabscale.
 */
static inline __m128
gmx_mm_int_1_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a;

    rt    = _mm_mul_ps(r,tabscale);
    n0    = _mm_cvttps_epi32(rt);
    eps   = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2  = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a   = gmx_mm_extract_epi32(n0,0);

    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     * Each record is 4*(offset+2) floats, so the dispersion quartet starts
     * 4*offset floats in.  Bugfix: this previously added only 'offset', which
     * for mixed tables (offset=1) addressed the wrong elements and broke the
     * 16-byte alignment required by _mm_load_ps; the four-interaction variant
     * already used 4*offset.
     */
    Yd    = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset);
    Fd    = _mm_setzero_ps();
    Gd    = _mm_setzero_ps();
    Hd    = _mm_setzero_ps();
    Yr    = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset + 4);
    Fr    = _mm_setzero_ps();
    Gr    = _mm_setzero_ps();
    Hr    = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd    = _mm_mul_ps(Hd,eps2);                  /* Heps2 */
    Gd    = _mm_mul_ps(Gd,eps);                   /* Geps  */
    Fd    = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));    /* Fp    */
    Hr    = _mm_mul_ps(Hr,eps2);                  /* Heps2 */
    Gr    = _mm_mul_ps(Gr,eps);                   /* Geps  */
    Fr    = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));    /* Fp    */
    vvdw6  = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd    = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr    = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr), tabscale);
}
1542 /* Return force should be multiplied by -rinv to get fscal */
/* Tabulated Coulomb + Lennard-Jones for a single interaction; table records
 * are 12 floats (Coulomb, dispersion, repulsion quartets).  Lanes 1-3 are
 * padded with zeroed rows and add exactly zero.
 * NOTE(review): declarations of n0 (__m128i) and n_a fall on lines not
 * visible in this extraction.
 */
1543 static inline __m128
1544 gmx_mm_int_1_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
1545 __m128 *vctot, __m128 *vvdwtot)
1547 __m128 rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
1551 rt = _mm_mul_ps(r,tabscale);
1552 n0 = _mm_cvttps_epi32(rt);
1553 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1554 eps2 = _mm_mul_ps(eps,eps);
1556 /* Extract indices from n0 */
1557 n_a = gmx_mm_extract_epi32(n0,0);
1559 Yc = _mm_load_ps(VFtab + 12* n_a);
1560 Fc = _mm_setzero_ps();
1561 Gc = _mm_setzero_ps();
1562 Hc = _mm_setzero_ps();
1563 Yd = _mm_load_ps(VFtab + 12* n_a + 4);
1564 Fd = _mm_setzero_ps();
1565 Gd = _mm_setzero_ps();
1566 Hd = _mm_setzero_ps();
1567 Yr = _mm_load_ps(VFtab + 12* n_a + 8);
1568 Fr = _mm_setzero_ps();
1569 Gr = _mm_setzero_ps();
1570 Hr = _mm_setzero_ps();
1571 _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
1572 _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
1573 _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
1574 Hc = _mm_mul_ps(Hc,eps2); /* Heps2 */
1575 Gc = _mm_mul_ps(Gc,eps); /* Geps */
1576 Fc = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc)); /* Fp */
1577 Hd = _mm_mul_ps(Hd,eps2); /* Heps2 */
1578 Gd = _mm_mul_ps(Gd,eps); /* Geps */
1579 Fd = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd)); /* Fp */
1580 Hr = _mm_mul_ps(Hr,eps2); /* Heps2 */
1581 Gr = _mm_mul_ps(Gr,eps); /* Geps */
1582 Fr = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr)); /* Fp */
1584 vcoul = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
1585 *vctot = _mm_add_ps(*vctot,vcoul);
1587 vvdw6 = _mm_mul_ps(c6, _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
1588 vvdw12 = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
1589 *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
1591 Fc = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
1592 Fd = _mm_mul_ps(c6, _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
1593 Fr = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
1595 return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
1602 /* Return force should be multiplied by +rinv to get fscal */
/* Generalized-Born interaction for four j-atoms via spline table lookup in
 * GBtab.  qq and the table scale are first multiplied by isaprod = isai*isaj;
 * the GB energy is subtracted from the total (see "sign is correct" below),
 * half of (vgb + ftmp*r) is added to *dvdasum, and the isaj-weighted per-atom
 * contributions are scattered back to dvdaj1..dvdaj4.
 * NOTE(review): the end of the parameter list (presumably the __m128 *vgbtot
 * argument), the n0 declaration, and the function tail/return fall on lines
 * not visible in this extraction - confirm against the full file.
 */
1603 static inline __m128
1604 gmx_mm_int_4_genborn_ps(__m128 r, __m128 isai,
1605 float * isaj1, float *isaj2, float *isaj3, float *isaj4,
1606 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
1607 float *dvdaj1, float *dvdaj2, float *dvdaj3, float *dvdaj4,
1610 const __m128 half = {0.5,0.5,0.5,0.5};
1612 __m128 rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
1614 int n_a,n_b,n_c,n_d;
/* gather the four scalar isaj values into one register */
1617 isaj = _mm_load_ss(isaj1);
1618 t2 = _mm_load_ss(isaj2);
1619 t3 = _mm_load_ss(isaj3);
1620 t4 = _mm_load_ss(isaj4);
1621 isaj = _mm_unpacklo_ps(isaj,t2); /* - - t2 t1 */
1622 t3 = _mm_unpacklo_ps(t3,t4); /* - - t4 t3 */
1623 isaj = _mm_movelh_ps(isaj,t3); /* t4 t3 t2 t1 */
1625 isaprod = _mm_mul_ps(isai,isaj);
1626 qq = _mm_mul_ps(qq,isaprod);
1627 gbtabscale = _mm_mul_ps( isaprod, gbtabscale );
1629 rt = _mm_mul_ps(r,gbtabscale);
1630 n0 = _mm_cvttps_epi32(rt);
1631 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1632 eps2 = _mm_mul_ps(eps,eps);
1634 /* Extract indices from n0 */
1635 n_a = gmx_mm_extract_epi32(n0,0);
1636 n_b = gmx_mm_extract_epi32(n0,1);
1637 n_c = gmx_mm_extract_epi32(n0,2);
1638 n_d = gmx_mm_extract_epi32(n0,3);
1639 Y = _mm_load_ps(GBtab + 4* n_a);
1640 F = _mm_load_ps(GBtab + 4* n_b);
1641 G = _mm_load_ps(GBtab + 4* n_c);
1642 H = _mm_load_ps(GBtab + 4* n_d);
1643 _MM_TRANSPOSE4_PS(Y,F,G,H);
1644 G = _mm_mul_ps(G,eps); /* Geps */
1645 H = _mm_mul_ps(H,eps2); /* Heps2 */
1646 F = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp */
1648 VV = _mm_add_ps(Y, _mm_mul_ps(eps,F));
1649 FF = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
1651 vgb = _mm_mul_ps(qq, VV);
1652 *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
1654 ftmp = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
1656 dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
1658 *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
1660 dvdatmp = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
1662 /* Update 4 dada[j] values */
1663 Y = _mm_load_ss(dvdaj1);
1664 F = _mm_load_ss(dvdaj2);
1665 G = _mm_load_ss(dvdaj3);
1666 H = _mm_load_ss(dvdaj4);
/* lane 0 of t2/t3/t4 holds element 1/2/3 of dvdatmp for the scalar stores */
1667 t3 = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
1668 t2 = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
1669 t4 = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,1));
1671 _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
1672 _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
1673 _mm_store_ss( dvdaj3 , _mm_add_ss( G, t3 ) );
1674 _mm_store_ss( dvdaj4 , _mm_add_ss( H, t4 ) );
1681 /* Return force should be multiplied by +rinv to get fscal */
/* Generalized-Born interaction for three j-atoms (lane 3 duplicates atom 3's
 * isaj).  qq and the table scale are multiplied by isaprod = isai*isaj, the
 * GB energy is subtracted from the total, half of (vgb + ftmp*r) is added to
 * *dvdasum, and the isaj-weighted contributions are scattered to
 * dvdaj1..dvdaj3.
 * NOTE(review): the end of the parameter list (presumably __m128 *vgbtot),
 * the n0 declaration, and the function tail/return fall on lines not visible
 * in this extraction.
 */
1682 static inline __m128
1683 gmx_mm_int_3_genborn_ps(__m128 r, __m128 isai,
1684 float * isaj1, float *isaj2, float *isaj3,
1685 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
1686 float *dvdaj1, float *dvdaj2, float *dvdaj3,
1689 const __m128 half = {0.5,0.5,0.5,0.5};
1691 __m128 rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
1693 int n_a,n_b,n_c,n_d;
/* gather the three scalar isaj values (lane 3 repeats the third) */
1696 isaj = _mm_load_ss(isaj1);
1697 t2 = _mm_load_ss(isaj2);
1698 t3 = _mm_load_ss(isaj3);
1699 isaj = _mm_unpacklo_ps(isaj,t2); /* - - t2 t1 */
1700 t3 = _mm_unpacklo_ps(t3,t3); /* - - t3 t3 */
1701 isaj = _mm_movelh_ps(isaj,t3); /* t3 t3 t2 t1 */
1703 isaprod = _mm_mul_ps(isai,isaj);
1704 qq = _mm_mul_ps(qq,isaprod);
1705 gbtabscale = _mm_mul_ps( isaprod, gbtabscale );
1707 rt = _mm_mul_ps(r,gbtabscale);
1708 n0 = _mm_cvttps_epi32(rt);
1709 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1710 eps2 = _mm_mul_ps(eps,eps);
1712 /* Extract indices from n0 */
1713 n_a = gmx_mm_extract_epi32(n0,0);
1714 n_b = gmx_mm_extract_epi32(n0,1);
1715 n_c = gmx_mm_extract_epi32(n0,2);
1716 Y = _mm_load_ps(GBtab + 4* n_a);
1717 F = _mm_load_ps(GBtab + 4* n_b);
1718 G = _mm_load_ps(GBtab + 4* n_c);
1719 H = _mm_setzero_ps();
1720 _MM_TRANSPOSE4_PS(Y,F,G,H);
1721 G = _mm_mul_ps(G,eps); /* Geps */
1722 H = _mm_mul_ps(H,eps2); /* Heps2 */
1723 F = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp */
1725 VV = _mm_add_ps(Y, _mm_mul_ps(eps,F));
1726 FF = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
1728 vgb = _mm_mul_ps(qq, VV);
1729 *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
1731 ftmp = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
1733 dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
1735 *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
1737 dvdatmp = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
1739 /* Update 3 dada[j] values */
1740 Y = _mm_load_ss(dvdaj1);
1741 F = _mm_load_ss(dvdaj2);
1742 G = _mm_load_ss(dvdaj3);
/* lane 0 of t2/t3 holds element 1/2 of dvdatmp for the scalar stores */
1743 t3 = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
1744 t2 = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
1746 _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
1747 _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
1748 _mm_store_ss( dvdaj3 , _mm_add_ss( G, t3 ) );
1756 /* Return force should be multiplied by +rinv to get fscal */
/* Generalized-Born interaction for two j-atoms; only lanes 0 and 1 carry
 * meaningful data.  Same structure as the 4-atom variant: qq and the table
 * scale are multiplied by isaprod = isai*isaj, the GB energy is subtracted
 * from the total, half of (vgb + ftmp*r) is added to *dvdasum, and the
 * isaj-weighted contributions are scattered to dvdaj1/dvdaj2.
 * NOTE(review): the end of the parameter list (presumably __m128 *vgbtot),
 * the n0 declaration, and the function tail/return fall on lines not visible
 * in this extraction.
 */
1757 static inline __m128
1758 gmx_mm_int_2_genborn_ps(__m128 r, __m128 isai,
1759 float * isaj1, float *isaj2,
1760 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
1761 float *dvdaj1, float *dvdaj2,
1764 const __m128 half = {0.5,0.5,0.5,0.5};
1766 __m128 rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
1768 int n_a,n_b,n_c,n_d;
/* gather the two scalar isaj values into lanes 0 and 1 */
1771 isaj = _mm_load_ss(isaj1);
1772 t2 = _mm_load_ss(isaj2);
1773 isaj = _mm_unpacklo_ps(isaj,t2); /* - - t2 t1 */
1775 isaprod = _mm_mul_ps(isai,isaj);
1776 qq = _mm_mul_ps(qq,isaprod);
1777 gbtabscale = _mm_mul_ps( isaprod, gbtabscale );
1779 rt = _mm_mul_ps(r,gbtabscale);
1780 n0 = _mm_cvttps_epi32(rt);
1781 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1782 eps2 = _mm_mul_ps(eps,eps);
1784 /* Extract indices from n0 */
1785 n_a = gmx_mm_extract_epi32(n0,0);
1786 n_b = gmx_mm_extract_epi32(n0,1);
1787 Y = _mm_load_ps(GBtab + 4* n_a);
1788 F = _mm_load_ps(GBtab + 4* n_b);
1789 G = _mm_setzero_ps();
1790 H = _mm_setzero_ps();
1791 _MM_TRANSPOSE4_PS(Y,F,G,H);
1792 G = _mm_mul_ps(G,eps); /* Geps */
1793 H = _mm_mul_ps(H,eps2); /* Heps2 */
1794 F = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp */
1796 VV = _mm_add_ps(Y, _mm_mul_ps(eps,F));
1797 FF = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
1799 vgb = _mm_mul_ps(qq, VV);
1800 *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
1802 ftmp = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
1804 dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
1806 *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
1808 dvdatmp = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
1810 /* Update 2 dada[j] values */
1811 Y = _mm_load_ss(dvdaj1);
1812 F = _mm_load_ss(dvdaj2);
/* lane 0 of t2 holds element 1 of dvdatmp for the scalar store */
1813 t2 = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
1815 _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
1816 _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
1821 /* Return force should be multiplied by +rinv to get fscal */
/* Generalized-Born interaction for a single j-atom; only lane 0 carries
 * meaningful data.  Same structure as the multi-atom variants: qq and the
 * table scale are multiplied by isaprod = isai*isaj, the GB energy is
 * subtracted from the total, half of (vgb + ftmp*r) is added to *dvdasum,
 * and the isaj-weighted contribution is stored back to dvdaj1.
 * NOTE(review): several parameter lines are missing from this extraction
 * (the 'float *isaj1', 'float *dvdaj1' and presumably '__m128 *vgbtot'
 * arguments), as are the n0 declaration and the tail/return - confirm
 * against the full file.
 */
1822 static inline __m128
1823 gmx_mm_int_1_genborn_ps(__m128 r, __m128 isai,
1825 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
1829 const __m128 half = {0.5,0.5,0.5,0.5};
1831 __m128 rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
1833 int n_a,n_b,n_c,n_d;
1836 isaj = _mm_load_ss(isaj1);
1838 isaprod = _mm_mul_ps(isai,isaj);
1839 qq = _mm_mul_ps(qq,isaprod);
1840 gbtabscale = _mm_mul_ps( isaprod, gbtabscale );
1842 rt = _mm_mul_ps(r,gbtabscale);
1843 n0 = _mm_cvttps_epi32(rt);
1844 eps = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
1845 eps2 = _mm_mul_ps(eps,eps);
1847 /* Extract indices from n0 */
1848 n_a = gmx_mm_extract_epi32(n0,0);
1849 Y = _mm_load_ps(GBtab + 4* n_a);
1850 F = _mm_setzero_ps();
1851 G = _mm_setzero_ps();
1852 H = _mm_setzero_ps();
1853 _MM_TRANSPOSE4_PS(Y,F,G,H);
1854 G = _mm_mul_ps(G,eps); /* Geps */
1855 H = _mm_mul_ps(H,eps2); /* Heps2 */
1856 F = _mm_add_ps(_mm_add_ps(F,G),H); /* Fp */
1858 VV = _mm_add_ps(Y, _mm_mul_ps(eps,F));
1859 FF = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
1861 vgb = _mm_mul_ps(qq, VV);
1862 *vgbtot = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
1864 ftmp = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
1866 dvdatmp = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
1868 *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
1870 dvdatmp = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
1872 /* Update 1 dada[j] values */
1873 Y = _mm_load_ss(dvdaj1);
1875 _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
/*
 * Horizontally sum the per-lane force accumulators for one atom and add the
 * totals to memory: fptr[0..2] += (sum fix1, sum fiy1, sum fiz1), and the
 * same increment goes to fshiftptr[0..2] (shift force).
 *
 * NOTE(review): this extract is missing the lines declaring t1,t2,t3 and the
 * fptr/fshiftptr parameters - confirm against the full header.  Lane 1 of
 * the transposed sum (the t1 column) is never written back to memory:
 * store_ss writes lane 0 and storeh_pi writes lanes 2-3 only.
 */
1885 gmx_mm_update_iforce_1atom_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
1891 /* transpose data */
1893 _MM_TRANSPOSE4_PS(fix1,t1,fiy1,fiz1);
/* fix1 = ( sum(fix1), -, sum(fiy1), sum(fiz1) ) after summing the transposed rows */
1894 fix1 = _mm_add_ps(_mm_add_ps(fix1,t1), _mm_add_ps(fiy1,fiz1));
/* load x into lane 0 and y,z into lanes 2-3 to match the layout above */
1896 t2 = _mm_load_ss(fptr);
1897 t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
1898 t3 = _mm_load_ss(fshiftptr);
1899 t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
1901 t2 = _mm_add_ps(t2,fix1);
1902 t3 = _mm_add_ps(t3,fix1);
/* scatter x (lane 0) and y,z (lanes 2-3) back to force and shift-force arrays */
1904 _mm_store_ss(fptr,t2);
1905 _mm_storeh_pi((__m64 *)(fptr+1),t2);
1906 _mm_store_ss(fshiftptr,t3);
1907 _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
/*
 * Horizontally sum the SIMD force accumulators for two atoms and add the
 * totals to the force array (fptr, 6 floats: x1 y1 z1 x2 y2 z2) and to the
 * shift-force vector (fshiftptr, 3 floats: x y z).
 *
 * Each input register holds four per-lane partial sums of the named force
 * component; this routine reduces them and scatters the six totals.
 *
 * Fix vs. original: the shift-force accumulation read an uninitialized
 * register ("t3 = _mm_add_ps(t3,t4)" with no prior write to t3 - unlike the
 * 3-atom variant, no shuffle here produces a t3).  The shift contribution is
 * now folded directly into t1.
 */
static inline void
gmx_mm_update_iforce_2atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               float *fptr, float *fshiftptr)
{
    __m128 t1, t2, t4;

    /* transpose atom-1 data; reduce atom-2 y/z via unpack pairs */
    _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2);
    t1 = _mm_unpacklo_ps(fiy2, fiz2);
    t2 = _mm_unpackhi_ps(fiy2, fiz2);

    /* fix1 = ( sum(fix1) sum(fiy1) sum(fiz1) sum(fix2) ) */
    fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2));
    t1   = _mm_add_ps(t1, t2);
    t2   = _mm_movehl_ps(t2, t1);
    /* fiy2 = ( sum(fiy2) sum(fiz2) - - ) */
    fiy2 = _mm_add_ps(t1, t2);

    /* increment the six force components in memory */
    _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr)));
    t1 = _mm_loadl_pi(t1, (__m64 *)(fptr + 4));
    _mm_storel_pi((__m64 *)(fptr + 4), _mm_add_ps(fiy2, t1));

    /* shift force loaded as ( z - x y ) */
    t4 = _mm_load_ss(fshiftptr + 2);
    t4 = _mm_loadh_pi(t4, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 2)); /* fiy2  -  fix2 fiz1 */
    t1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 1, 0, 0));     /* fiy2 fix2  -  fiz1 */
    t2 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(1, 0, 0, 1)); /* fiy1 fix1  -  fiz2 */

    t1 = _mm_add_ps(t1, t2);
    t1 = _mm_add_ps(t1, t4); /* y x - z */

    _mm_store_ss(fshiftptr + 2, t1);
    _mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
/*
 * Horizontally sum the SIMD force accumulators for three atoms and add the
 * totals to the force array (fptr, 9 floats: x1 y1 z1 x2 y2 z2 x3 y3 z3)
 * and to the shift-force vector (fshiftptr, 3 floats: x y z).
 *
 * Fix vs. original: "t2 = _mm_movehl_ps(t2,fiz3)" read uninitialized t2
 * (undefined behavior even though the garbage lanes were never stored);
 * a zeroed source is used instead, with identical observable results.
 */
static inline void
gmx_mm_update_iforce_3atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               float *fptr, float *fshiftptr)
{
    __m128 t1, t2, t3, t4;

    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2);
    _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3);
    /* gather the four fiz3 lanes for the scalar reduction below;
     * only lanes fiz3[2],fiz3[3] of t2 are consumed */
    t2 = _mm_movehl_ps(_mm_setzero_ps(), fiz3);
    t1 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(0, 0, 0, 1));
    t3 = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 0, 0, 1));

    /* fix1 = (Σx1 Σy1 Σz1 Σx2), fiy2 = (Σy2 Σz2 Σx3 Σy3), fiz3 lane 0 = Σz3 */
    fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3));
    fiz3 = _mm_add_ss(_mm_add_ps(fiz3, t1), _mm_add_ps(t2, t3));

    /* increment the nine force components in memory */
    _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr + 4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr + 4)));
    _mm_store_ss(fptr + 8, _mm_add_ss(fiz3, _mm_load_ss(fptr + 8)));

    /* shift force loaded as ( z - x y ) */
    t4 = _mm_load_ss(fshiftptr + 2);
    t4 = _mm_loadh_pi(t4, (__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); /* fiy1 fix1  -  fiz3 */
    t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); /* fiy3 fix3  -  fiz1 */
    t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(1, 2, 0, 0));     /* fiy2 fix2  -  fiz2 */

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3); /* y x - z */

    _mm_store_ss(fshiftptr + 2, t1);
    _mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
/*
 * Horizontally sum the SIMD force accumulators for four atoms and add the
 * totals to the force array (fptr, 12 floats: x1 y1 z1 ... z4) and to the
 * shift-force vector (fshiftptr, 3 floats: x y z).
 *
 * Fix vs. original: the shift-force load was "t5 = _mm_loadh_pi(t4,...)",
 * which read uninitialized t4 and discarded the z component just loaded
 * into t5 by _mm_load_ss; the high halves must be merged into t5 itself.
 */
static inline void
gmx_mm_update_iforce_4atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               __m128 fix4, __m128 fiy4, __m128 fiz4,
                               float *fptr, float *fshiftptr)
{
    __m128 t1, t2, t3, t4, t5;

    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2);
    _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3);
    _MM_TRANSPOSE4_PS(fiz3, fix4, fiy4, fiz4);

    /* fix1 = (Σx1 Σy1 Σz1 Σx2), fiy2 = (Σy2 Σz2 Σx3 Σy3), fiz3 = (Σz3 Σx4 Σy4 Σz4) */
    fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3));
    fiz3 = _mm_add_ps(_mm_add_ps(fiz3, fix4), _mm_add_ps(fiy4, fiz4));

    /* increment the twelve force components in memory */
    _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr + 4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr + 4)));
    _mm_storeu_ps(fptr + 8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr + 8)));

    /* shift force loaded as ( z - x y ) */
    t5 = _mm_load_ss(fshiftptr + 2);
    t5 = _mm_loadh_pi(t5, (__m64 *)(fshiftptr)); /* was loadh_pi(t4,...): t4 uninitialized */

    t1 = _mm_shuffle_ps(fix1, fix1, _MM_SHUFFLE(1, 0, 2, 2)); /* fiy1 fix1  -  fiz1 */
    t2 = _mm_shuffle_ps(fiy2, fiy2, _MM_SHUFFLE(3, 2, 1, 1)); /* fiy3 fix3  -  fiz2 */
    t3 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(2, 1, 0, 0)); /* fiy4 fix4  -  fiz3 */
    t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); /* fiy2 fiy2 fix2 fix2 */
    t4 = _mm_shuffle_ps(fiz3, t4, _MM_SHUFFLE(2, 0, 3, 3));   /* fiy2 fix2  -  fiz4 */

    t1 = _mm_add_ps(t1, t2);
    t3 = _mm_add_ps(t3, t4);
    t1 = _mm_add_ps(t1, t3); /* y x - z */
    t5 = _mm_add_ps(t5, t1);

    _mm_store_ss(fshiftptr + 2, t5);
    _mm_storeh_pi((__m64 *)(fshiftptr), t5);
}
/*
 * Horizontal reduction of one potential accumulator:
 * *ptr1 += pot1[0] + pot1[1] + pot1[2] + pot1[3].
 */
static inline void
gmx_mm_update_1pot_ps(__m128 pot1, float *ptr1)
{
    /* fold high half onto low half: lanes 0,1 = (p0+p2, p1+p3) */
    __m128 acc = _mm_add_ps(pot1, _mm_movehl_ps(pot1, pot1));
    /* fold lane 1 onto lane 0: lane 0 = p0+p1+p2+p3 */
    acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(0, 0, 0, 1)));
    /* accumulate into memory */
    _mm_store_ss(ptr1, _mm_add_ss(acc, _mm_load_ss(ptr1)));
}
/*
 * Horizontal reduction of two potential accumulators:
 * *ptr1 += sum(pot1),  *ptr2 += sum(pot2).
 *
 * Fix vs. original: the second result was taken as _mm_movehl_ps(t2,t1),
 * whose lane 0 is t1[2] = pot2[0]+pot2[2] - only half the sum.  The fully
 * reduced vector is pot1 (its lane 2 holds sum(pot2), matching the
 * "- 2 - 1" comment), so the high half of pot1 must be shifted down.
 */
static inline void
gmx_mm_update_2pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2)
{
    __m128 t1, t2;

    t1   = _mm_movehl_ps(pot2, pot1); /* 2d 2c 1d 1c */
    t2   = _mm_movelh_ps(pot1, pot2); /* 2b 2a 1b 1a */
    t1   = _mm_add_ps(t1, t2);        /* 2  2  1  1  */
    t2   = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 1, 1));
    pot1 = _mm_add_ps(t1, t2);        /* -  2  -  1  */
    pot2 = _mm_movehl_ps(t2, pot1);   /* -  -  -  2  (was movehl(t2,t1): half sum) */

    _mm_store_ss(ptr1, _mm_add_ss(pot1, _mm_load_ss(ptr1)));
    _mm_store_ss(ptr2, _mm_add_ss(pot2, _mm_load_ss(ptr2)));
}