Merge remote-tracking branch 'gerrit/release-4-6'
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / sse_common_single.h
1 /*
2  *                This source code is part of
3  *
4  *                 G   R   O   M   A   C   S
5  *
6  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7  * Copyright (c) 2001-2009, The GROMACS Development Team
8  *
9  * Gromacs is a library for molecular simulation and trajectory analysis,
10  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11  * a full list of developers and information, check out http://www.gromacs.org
12  *
13  * This program is free software; you can redistribute it and/or modify it under 
14  * the terms of the GNU Lesser General Public License as published by the Free 
15  * Software Foundation; either version 2 of the License, or (at your option) any 
16  * later version.
17  * As a special exception, you may use this file as part of a free software
18  * library without restriction.  Specifically, if other files instantiate
19  * templates or use macros or inline functions from this file, or you compile
20  * this file and link it with other files to produce an executable, this
21  * file does not by itself cause the resulting executable to be covered by
22  * the GNU Lesser General Public License.  
23  *
24  * In plain-speak: do not worry about classes/macros/templates either - only
25  * changes to the library have to be LGPL, not an application linking with it.
26  *
27  * To help fund GROMACS development, we humbly ask that you cite
28  * the papers people have written on it - you can find them on the website!
29  */
30 #ifdef HAVE_CONFIG_H
31 #include <config.h>
32 #endif
33
34 #include <emmintrin.h>
35
36 #include <stdio.h>
37   
38
39
/* Debug helper: print the four single-precision lanes of an SSE register,
 * lowest lane first, prefixed by the label s.
 */
static inline void
gmx_printxmm_ps(const char *s,__m128 xmm)
{
    float lane[4];
    int   i;

    /* Unaligned store so the caller's stack buffer needs no special alignment. */
    _mm_storeu_ps(lane,xmm);
    printf("%s:",s);
    for(i=0;i<4;i++)
    {
        printf(" %15.10g",lane[i]);
    }
    printf("\n");
}
48
49
50
/* Load the x/y/z coordinates of one atom, stored contiguously as ptr1[0..2],
 * into the lowest SSE lane of jx1/jy1/jz1.  Only lane 0 of each output is
 * meaningful; the upper lanes contain zeros or stray copies of other values
 * and must not be relied upon.
 */
#define GMX_MM_LOAD_1JCOORD_1ATOM_PS(ptr1,jx1,jy1,jz1) {              \
     jx1            = _mm_load_ss(ptr1);                               \
     jy1            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+1));\
     jz1            = _mm_shuffle_ps(jy1,jy1,_MM_SHUFFLE(0,0,0,1));    \
}
56
57
/* Load x/y/z for two consecutive atoms stored as ptr1[0..5]
 * (x1 y1 z1 x2 y2 z2) into the lowest SSE lane of jx1..jz2.
 * Only lane 0 of each output is meaningful.
 */
#define GMX_MM_LOAD_1JCOORD_2ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
     jx1            = _mm_loadu_ps(ptr1);                              \
     jy1            = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,1));    \
     jz1            = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2));    \
     jx2            = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,3));    \
     jy2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));\
     jz2            = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,1));    \
}
66
67
/* Load x/y/z for three consecutive atoms stored as ptr1[0..8] into the lowest
 * SSE lane of jx1..jz3.  Only lane 0 of each output is meaningful.  The nine
 * scalars are fetched with two unaligned 4-wide loads plus one scalar load,
 * then individual elements are broadcast out with shuffles.
 */
#define GMX_MM_LOAD_1JCOORD_3ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
     jx1            = _mm_loadu_ps(ptr1);                                          \
     jy1            = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,1));                \
     jz1            = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2));                \
     jx2            = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,3));                \
     jy2            = _mm_loadu_ps(ptr1+4);                                        \
     jz2            = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,1));                \
     jx3            = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,2));                \
     jy3            = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,3));                \
     jz3            = _mm_load_ss(ptr1+8);                                         \
}
79
80
/* Load x/y/z for four consecutive atoms stored as ptr1[0..11] into the lowest
 * SSE lane of jx1..jz4.  Only lane 0 of each output is meaningful.  Three
 * unaligned 4-wide loads cover the twelve scalars; shuffles extract each one.
 */
#define GMX_MM_LOAD_1JCOORD_4ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
     jx1            = _mm_loadu_ps(ptr1);                                                      \
     jy1            = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,1));                            \
     jz1            = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2));                            \
     jx2            = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,3));                            \
     jy2            = _mm_loadu_ps(ptr1+4);                                                    \
     jz2            = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,1));                            \
     jx3            = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,2));                            \
     jy3            = _mm_shuffle_ps(jy2,jy2,_MM_SHUFFLE(0,0,0,3));                            \
     jz3            = _mm_loadu_ps(ptr1+8);                                                    \
     jx4            = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));                            \
     jy4            = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,2));                            \
     jz4            = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,3));                            \
}
95
96
/* Load the x/y/z coordinates of one atom from each of two j-particle pointers
 * (ptr1[0..2] and ptr2[0..2]) and transpose, so that jx1/jy1/jz1 hold the
 * value from ptr1 in SSE lane 0 and from ptr2 in lane 1.  Lanes 2-3 are
 * unspecified and must not be relied upon.
 *
 * Fixes vs. previous version: removed a duplicated jx1 assignment, and the
 * final movehl now sources both operands from jy1 instead of reading the
 * uninitialized output register jz1 (lanes 0-1 are identical either way).
 */
#define GMX_MM_LOAD_2JCOORD_1ATOM_PS(ptr1,ptr2,jx1,jy1,jz1) {  \
     __m128 _tmp1,_tmp2;                                        \
     _tmp1           = _mm_load_ss(ptr1);                       \
     _tmp2           = _mm_load_ss(ptr2);                       \
     _tmp1           = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));   \
     _tmp2           = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1));   \
     /* _tmp1 = {x1,-,y1,z1}, _tmp2 = {x2,-,y2,z2} */           \
     jx1             = _mm_unpacklo_ps(_tmp1,_tmp2);            \
     jy1             = _mm_unpackhi_ps(_tmp1,_tmp2);            \
     jz1             = _mm_movehl_ps(jy1,jy1);                  \
}
108
109
/* Load x/y/z for two atoms from each of two j-particle pointers (ptr1[0..5]
 * and ptr2[0..5]) and transpose, so each output register holds the value from
 * ptr1 in lane 0 and from ptr2 in lane 1.  Lanes 2-3 are unspecified.
 */
#define GMX_MM_LOAD_2JCOORD_2ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
     __m128 _tmp1, _tmp2;                                                   \
     _tmp1          = _mm_loadu_ps(ptr1);                                   \
     jy1            = _mm_loadu_ps(ptr2);                                   \
     jy2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));     \
     _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4));     \
     jx1            = _mm_unpacklo_ps(_tmp1,jy1);                           \
     jz1            = _mm_unpackhi_ps(_tmp1,jy1);                           \
     jy2            = _mm_unpacklo_ps(jy2,_tmp2);                           \
     jy1            = _mm_movehl_ps(jx1,jx1);                               \
     jx2            = _mm_movehl_ps(jz1,jz1);                               \
     jz2            = _mm_movehl_ps(jy2,jy2);                               \
}
123
124
/* Load x/y/z for three atoms from each of two j-particle pointers
 * (ptr1[0..8] and ptr2[0..8]) and transpose, so each output register holds
 * the value from ptr1 in lane 0 and from ptr2 in lane 1.  Lanes 2-3 are
 * unspecified.  Several output registers double as scratch during the
 * unpack/movehl sequence.
 */
#define GMX_MM_LOAD_2JCOORD_3ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
     __m128 _tmp1, _tmp2, _tmp3;                                                        \
     _tmp1          = _mm_loadu_ps(ptr1);                                               \
     jy1            = _mm_loadu_ps(ptr2);                                               \
     _tmp2          = _mm_loadu_ps(ptr1+4);                                             \
     jz2            = _mm_loadu_ps(ptr2+4);                                             \
     jz3            = _mm_load_ss(ptr1+8);                                              \
     _tmp3          = _mm_load_ss(ptr2+8);                                              \
     jx1            = _mm_unpacklo_ps(_tmp1,jy1);                                       \
     jz1            = _mm_unpackhi_ps(_tmp1,jy1);                                       \
     jy2            = _mm_unpacklo_ps(_tmp2,jz2);                                       \
     jx3            = _mm_unpackhi_ps(_tmp2,jz2);                                       \
     jy1            = _mm_movehl_ps(jx1,jx1);                                           \
     jx2            = _mm_movehl_ps(jz1,jz1);                                           \
     jz2            = _mm_movehl_ps(jy2,jy2);                                           \
     jy3            = _mm_movehl_ps(jx3,jx3);                                           \
     jz3            = _mm_unpacklo_ps(jz3,_tmp3);                                       \
}
143
144
/* Load x/y/z for four atoms from each of two j-particle pointers
 * (ptr1[0..11] and ptr2[0..11]) and transpose, so each output register holds
 * the value from ptr1 in lane 0 and from ptr2 in lane 1.  Lanes 2-3 are
 * unspecified.
 */
#define GMX_MM_LOAD_2JCOORD_4ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
     __m128 _tmp1, _tmp2, _tmp3,_tmp4;                                                  \
     _tmp1          = _mm_loadu_ps(ptr1);                                               \
     jy1            = _mm_loadu_ps(ptr2);                                               \
     _tmp2          = _mm_loadu_ps(ptr1+4);                                             \
     jz2            = _mm_loadu_ps(ptr2+4);                                             \
     _tmp3          = _mm_loadu_ps(ptr1+8);                                             \
     _tmp4          = _mm_loadu_ps(ptr2+8);                                             \
     jx1            = _mm_unpacklo_ps(_tmp1,jy1);                                       \
     jz1            = _mm_unpackhi_ps(_tmp1,jy1);                                       \
     jy2            = _mm_unpacklo_ps(_tmp2,jz2);                                       \
     jx3            = _mm_unpackhi_ps(_tmp2,jz2);                                       \
     jz3            = _mm_unpacklo_ps(_tmp3,_tmp4);                                     \
     jy4            = _mm_unpackhi_ps(_tmp3,_tmp4);                                     \
     jy1            = _mm_movehl_ps(jx1,jx1);                                           \
     jx2            = _mm_movehl_ps(jz1,jz1);                                           \
     jz2            = _mm_movehl_ps(jy2,jy2);                                           \
     jy3            = _mm_movehl_ps(jx3,jx3);                                           \
     jx4            = _mm_movehl_ps(jz3,jz3);                                           \
     jz4            = _mm_movehl_ps(jy4,jy4);                                           \
}
166
167
/* Load the x/y/z coordinates of one atom from each of three j-particle
 * pointers and transpose, so jx1/jy1/jz1 hold the value from ptr1 in lane 0,
 * ptr2 in lane 1 and ptr3 in lane 2.  Lane 3 is unspecified.  Each input is
 * gathered as {x,-,y,z} with load_ss + loadh_pi before the transpose.
 */
#define GMX_MM_LOAD_3JCOORD_1ATOM_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
     __m128 _tmp1,_tmp3,_tmp4;                                      \
     jx1            = _mm_load_ss(ptr1);                            \
     jy1            = _mm_load_ss(ptr2);                            \
     jz1            = _mm_load_ss(ptr3);                            \
     jx1            = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1));          \
     jy1            = _mm_loadh_pi(jy1,(__m64 *)(ptr2+1));          \
     jz1            = _mm_loadh_pi(jz1,(__m64 *)(ptr3+1));          \
     _tmp1          = _mm_unpacklo_ps(jx1,jy1);                     \
     _tmp3          = _mm_unpackhi_ps(jx1,jy1);                     \
     _tmp4          = _mm_unpackhi_ps(jz1,jz1);                     \
     jx1            = _mm_movelh_ps(_tmp1,jz1);                     \
     jy1            = _mm_movelh_ps(_tmp3,_tmp4);                   \
     jz1            = _mm_movehl_ps(_tmp4,_tmp3);                   \
}
183
184
/* Load x/y/z for two atoms from each of three j-particle pointers
 * (ptrN[0..5]) and transpose, so each output register holds ptr1 in lane 0,
 * ptr2 in lane 1 and ptr3 in lane 2.  Lane 3 is unspecified.  jx2 is zeroed
 * only to serve as the fourth input row of _MM_TRANSPOSE4_PS.
 */
#define GMX_MM_LOAD_3JCOORD_2ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
     __m128 _tmp1, _tmp2;                                                        \
     jx1            = _mm_loadu_ps(ptr1);                                        \
     jy1            = _mm_loadu_ps(ptr2);                                        \
     jz1            = _mm_loadu_ps(ptr3);                                        \
     jx2            = _mm_setzero_ps();                                          \
     _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                         \
     jy2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));          \
     jz2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4));          \
     _tmp1          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));          \
     _tmp2          = _mm_unpacklo_ps(jy2,jz2);                                  \
     _tmp1          = _mm_unpacklo_ps(_tmp1,_tmp1);                              \
     jy2            = _mm_movelh_ps(_tmp2,_tmp1);                                \
     jz2            = _mm_movehl_ps(_tmp1,_tmp2);                                \
}
200
201
/* Load x/y/z for three atoms from each of three j-particle pointers
 * (ptrN[0..8]) and transpose, so each output register holds ptr1 in lane 0,
 * ptr2 in lane 1 and ptr3 in lane 2.  Lane 3 is unspecified.  jx2 and jy3 are
 * zeroed only as filler rows for _MM_TRANSPOSE4_PS; the trailing z-elements
 * at ptrN[8] are merged with two unpacks so jz3 ends up as {z1,z2,z3,-}.
 */
#define GMX_MM_LOAD_3JCOORD_3ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) {  \
     __m128 _tmp1, _tmp2;                                                                     \
     jx1            = _mm_loadu_ps(ptr1);                                                     \
     jy1            = _mm_loadu_ps(ptr2);                                                     \
     jz1            = _mm_loadu_ps(ptr3);                                                     \
     jx2            = _mm_setzero_ps();                                                       \
     _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                      \
     jy2            = _mm_loadu_ps(ptr1+4);                                                   \
     jz2            = _mm_loadu_ps(ptr2+4);                                                   \
     jx3            = _mm_loadu_ps(ptr3+4);                                                   \
     jy3            = _mm_setzero_ps();                                                       \
     _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                      \
     jz3            = _mm_load_ss(ptr1+8);                                                    \
     _tmp1          = _mm_load_ss(ptr2+8);                                                    \
     _tmp2          = _mm_load_ss(ptr3+8);                                                    \
     jz3            = _mm_unpacklo_ps(jz3,_tmp2);                                             \
     _tmp1          = _mm_unpacklo_ps(_tmp1,_tmp1);                                           \
     jz3            = _mm_unpacklo_ps(jz3,_tmp1);                                             \
}
221
222
/* Load x/y/z for four atoms from each of three j-particle pointers
 * (ptrN[0..11]) and transpose, so each output register holds ptr1 in lane 0,
 * ptr2 in lane 1 and ptr3 in lane 2.  Lane 3 is unspecified.  jx2, jy3 and
 * jz4 are zeroed only as filler rows for the three _MM_TRANSPOSE4_PS calls.
 */
#define GMX_MM_LOAD_3JCOORD_4ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
     jx1            = _mm_loadu_ps(ptr1);                                                     \
     jy1            = _mm_loadu_ps(ptr2);                                                     \
     jz1            = _mm_loadu_ps(ptr3);                                                     \
     jx2            = _mm_setzero_ps();                                                       \
     _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                      \
     jy2            = _mm_loadu_ps(ptr1+4);                                                   \
     jz2            = _mm_loadu_ps(ptr2+4);                                                   \
     jx3            = _mm_loadu_ps(ptr3+4);                                                   \
     jy3            = _mm_setzero_ps();                                                       \
     _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                      \
     jz3            = _mm_loadu_ps(ptr1+8);                                                   \
     jx4            = _mm_loadu_ps(ptr2+8);                                                   \
     jy4            = _mm_loadu_ps(ptr3+8);                                                   \
     jz4            = _mm_setzero_ps();                                                       \
     _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4);                                                      \
}
240
241
242
/* Load the x/y/z coordinates of one atom from each of four j-particle
 * pointers and transpose, so jx1/jy1/jz1 hold the values from ptr1..ptr4 in
 * lanes 0..3 respectively.  Each input is gathered as {x,-,y,z} with
 * load_ss + loadh_pi before the 4x4 transpose built from unpack/movelh/movehl.
 */
#define GMX_MM_LOAD_4JCOORD_1ATOM_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;                               \
     jx1            = _mm_load_ss(ptr1);                                 \
     _tmp1          = _mm_load_ss(ptr2);                                 \
     jy1            = _mm_load_ss(ptr3);                                 \
     jz1            = _mm_load_ss(ptr4);                                 \
     jx1            = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1));               \
     _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2+1));             \
     jy1            = _mm_loadh_pi(jy1,(__m64 *)(ptr3+1));               \
     jz1            = _mm_loadh_pi(jz1,(__m64 *)(ptr4+1));               \
     _tmp2          = _mm_unpacklo_ps(jx1,_tmp1);                        \
     _tmp3          = _mm_unpacklo_ps(jy1,jz1);                          \
     _tmp4          = _mm_unpackhi_ps(jx1,_tmp1);                        \
     _tmp5          = _mm_unpackhi_ps(jy1,jz1);                          \
     jx1            = _mm_movelh_ps(_tmp2,_tmp3);                        \
     jy1            = _mm_movelh_ps(_tmp4,_tmp5);                        \
     jz1            = _mm_movehl_ps(_tmp5,_tmp4);                        \
}
261
262
/* Load x/y/z for two atoms from each of four j-particle pointers
 * (ptrN[0..5]) and transpose, so each output register holds the values from
 * ptr1..ptr4 in lanes 0..3 respectively.
 */
#define GMX_MM_LOAD_4JCOORD_2ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
     __m128 _tmp1, _tmp2;                                                             \
     jx1            = _mm_loadu_ps(ptr1);                                             \
     jy1            = _mm_loadu_ps(ptr2);                                             \
     jz1            = _mm_loadu_ps(ptr3);                                             \
     jx2            = _mm_loadu_ps(ptr4);                                             \
     _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                              \
     jy2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));               \
     jz2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4));               \
     _tmp1          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));               \
     _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4+4));               \
     _tmp1          = _mm_unpacklo_ps(jy2,_tmp1);                                     \
     _tmp2          = _mm_unpacklo_ps(jz2,_tmp2);                                     \
     jy2            = _mm_unpacklo_ps(_tmp1,_tmp2);                                   \
     jz2            = _mm_unpackhi_ps(_tmp1,_tmp2);                                   \
}
279
280
/* Load x/y/z for three atoms from each of four j-particle pointers
 * (ptrN[0..8]) and transpose, so each output register holds the values from
 * ptr1..ptr4 in lanes 0..3 respectively.  The four trailing z-elements at
 * ptrN[8] are gathered scalar-by-scalar and interleaved into jz3.
 */
#define GMX_MM_LOAD_4JCOORD_3ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
     __m128 _tmp1, _tmp2, _tmp3;                                                              \
     jx1            = _mm_loadu_ps(ptr1);                                                     \
     jy1            = _mm_loadu_ps(ptr2);                                                     \
     jz1            = _mm_loadu_ps(ptr3);                                                     \
     jx2            = _mm_loadu_ps(ptr4);                                                     \
     _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                      \
     jy2            = _mm_loadu_ps(ptr1+4);                                                   \
     jz2            = _mm_loadu_ps(ptr2+4);                                                   \
     jx3            = _mm_loadu_ps(ptr3+4);                                                   \
     jy3            = _mm_loadu_ps(ptr4+4);                                                   \
     _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                      \
     jz3            = _mm_load_ss(ptr1+8);                                                    \
     _tmp1          = _mm_load_ss(ptr2+8);                                                    \
     _tmp2          = _mm_load_ss(ptr3+8);                                                    \
     _tmp3          = _mm_load_ss(ptr4+8);                                                    \
     jz3            = _mm_unpacklo_ps(jz3,_tmp2);                                             \
     _tmp1          = _mm_unpacklo_ps(_tmp1,_tmp3);                                           \
     jz3            = _mm_unpacklo_ps(jz3,_tmp1);                                             \
}
301
302
/* Load x/y/z for four atoms from each of four j-particle pointers
 * (ptrN[0..11]) and transpose, so each output register holds the values from
 * ptr1..ptr4 in lanes 0..3 respectively.  Three full 4x4 transposes cover
 * the twelve coordinates per pointer.
 */
#define GMX_MM_LOAD_4JCOORD_4ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
     jx1            = _mm_loadu_ps(ptr1);                                                     \
     jy1            = _mm_loadu_ps(ptr2);                                                     \
     jz1            = _mm_loadu_ps(ptr3);                                                     \
     jx2            = _mm_loadu_ps(ptr4);                                                     \
     _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                      \
     jy2            = _mm_loadu_ps(ptr1+4);                                                   \
     jz2            = _mm_loadu_ps(ptr2+4);                                                   \
     jx3            = _mm_loadu_ps(ptr3+4);                                                   \
     jy3            = _mm_loadu_ps(ptr4+4);                                                   \
     _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                      \
     jz3            = _mm_loadu_ps(ptr1+8);                                                   \
     jx4            = _mm_loadu_ps(ptr2+8);                                                   \
     jy4            = _mm_loadu_ps(ptr3+8);                                                   \
     jz4            = _mm_loadu_ps(ptr4+8);                                                   \
     _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4);                                                      \
}
320
321
/* Add the lane-0 values of jx1/jy1/jz1 to the three floats at ptr1[0..2]
 * (read-modify-write; presumably force accumulation — the kernels define the
 * meaning).  Clobbers jy1 and jx1, which are reused as scratch.
 */
#define GMX_MM_UPDATE_1JCOORD_1ATOM_PS(ptr1,jx1,jy1,jz1) {           \
     __m128 _tmp1;                                                    \
     jy1            = _mm_unpacklo_ps(jy1,jz1);                       \
     jx1            = _mm_movelh_ps(jx1,jy1);                         \
     _tmp1          = _mm_load_ss(ptr1);                              \
     _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));          \
     _tmp1          = _mm_add_ps(_tmp1,jx1);                          \
     _mm_store_ss(ptr1,_tmp1);                                        \
     _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                          \
}
332
333
/* Add the lane-0 values of jx1..jz2 to the six floats at ptr1[0..5]
 * (x1 y1 z1 x2 y2 z2, read-modify-write).  Clobbers the j-registers, which
 * are repacked in-place before the adds.
 */
#define GMX_MM_UPDATE_1JCOORD_2ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
     __m128 _tmp1, _tmp2;                                               \
     _tmp1          = _mm_loadu_ps(ptr1);                               \
     _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4)); \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                         \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                         \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                         \
     jx1            = _mm_movelh_ps(jx1,jz1);                           \
     _tmp1          = _mm_add_ps(_tmp1,jx1);                            \
     _tmp2          = _mm_add_ps(_tmp2,jy2);                            \
     _mm_storeu_ps(ptr1,_tmp1);                                         \
     _mm_storel_pi((__m64 *)(ptr1+4),_tmp2);                            \
}
347
348
/* Add the lane-0 values of jx1..jz3 to the nine floats at ptr1[0..8]
 * (read-modify-write).  Clobbers the j-registers, which are repacked into
 * two 4-wide vectors plus one scalar before the adds.
 */
#define GMX_MM_UPDATE_1JCOORD_3ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
     __m128 _tmp1, _tmp2, _tmp3;                                        \
     _tmp1          = _mm_loadu_ps(ptr1);                               \
     _tmp2          = _mm_loadu_ps(ptr1+4);                             \
     _tmp3          = _mm_load_ss(ptr1+8);                              \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                         \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                         \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                         \
     jx3            = _mm_unpacklo_ps(jx3,jy3);                         \
     jx1            = _mm_movelh_ps(jx1,jz1);                           \
     jy2            = _mm_movelh_ps(jy2,jx3);                           \
     _tmp1           = _mm_add_ps(_tmp1,jx1);                           \
     _tmp2           = _mm_add_ps(_tmp2,jy2);                           \
     _tmp3           = _mm_add_ss(_tmp3,jz3);                           \
     _mm_storeu_ps(ptr1,_tmp1);                                         \
     _mm_storeu_ps(ptr1+4,_tmp2);                                       \
     _mm_store_ss(ptr1+8,_tmp3);                                        \
}
367
368
/* Add the lane-0 values of jx1..jz4 to the twelve floats at ptr1[0..11]
 * (read-modify-write).  Clobbers the j-registers, which are repacked into
 * three 4-wide vectors before the adds.
 */
#define GMX_MM_UPDATE_1JCOORD_4ATOMS_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
     __m128 _tmp1, _tmp2, _tmp3;                                        \
     _tmp1          = _mm_loadu_ps(ptr1);                               \
     _tmp2          = _mm_loadu_ps(ptr1+4);                             \
     _tmp3          = _mm_loadu_ps(ptr1+8);                             \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                         \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                         \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                         \
     jx3            = _mm_unpacklo_ps(jx3,jy3);                         \
     jz3            = _mm_unpacklo_ps(jz3,jx4);                         \
     jy4            = _mm_unpacklo_ps(jy4,jz4);                         \
     jx1            = _mm_movelh_ps(jx1,jz1);                           \
     jy2            = _mm_movelh_ps(jy2,jx3);                           \
     jz3            = _mm_movelh_ps(jz3,jy4);                           \
     _tmp1          = _mm_add_ps(_tmp1,jx1);                            \
     _tmp2          = _mm_add_ps(_tmp2,jy2);                            \
     _tmp3          = _mm_add_ps(_tmp3,jz3);                            \
     _mm_storeu_ps(ptr1,_tmp1);                                         \
     _mm_storeu_ps(ptr1+4,_tmp2);                                       \
     _mm_storeu_ps(ptr1+8,_tmp3);                                       \
}
390
391
/* Add per-pointer values to one atom at each of two j-particle pointers:
 * lane 0 of jx1/jy1/jz1 is added to ptr1[0..2] and lane 1 to ptr2[0..2]
 * (read-modify-write).  Clobbers jx1, which is reused as scratch.
 */
#define GMX_MM_UPDATE_2JCOORD_1ATOM_PS(ptr1,ptr2,jx1,jy1,jz1) {        \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4;                                    \
     _tmp1          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1));   \
     _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2));              \
     _tmp2          = _mm_load_ss(ptr1+2);                              \
     _tmp3          = _mm_load_ss(ptr2+2);                              \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                         \
     _tmp4          = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1));     \
     _tmp1          = _mm_add_ps(_tmp1,jx1);                            \
     _mm_storel_pi((__m64 *)(ptr1),_tmp1);                              \
     _mm_storeh_pi((__m64 *)(ptr2),_tmp1);                              \
     _mm_store_ss(ptr1+2,_mm_add_ss(_tmp2,jz1));                        \
     _mm_store_ss(ptr2+2,_mm_add_ss(_tmp3,_tmp4));                      \
}
406
407
/* Add per-pointer values to two atoms at each of two j-particle pointers:
 * lane 0 of each j-register is added to ptr1[0..5] and lane 1 to ptr2[0..5]
 * (read-modify-write).  Clobbers jx1, jz1 and jy2, which are repacked
 * in-place before the adds.
 */
#define GMX_MM_UPDATE_2JCOORD_2ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) {  \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;                                     \
     _tmp1          = _mm_loadu_ps(ptr1);                                      \
     _tmp2          = _mm_loadu_ps(ptr2);                                      \
     _tmp3          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));        \
     _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4));                   \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
     _tmp4          = _mm_movelh_ps(jx1,jz1);                                  \
     _tmp5          = _mm_movehl_ps(jz1,jx1);                                  \
     _tmp1          = _mm_add_ps(_tmp1,_tmp4);                                 \
     _tmp2          = _mm_add_ps(_tmp2,_tmp5);                                 \
     _tmp3          = _mm_add_ps(_tmp3,jy2);                                   \
     _mm_storeu_ps(ptr1,_tmp1);                                                \
     _mm_storeu_ps(ptr2,_tmp2);                                                \
     _mm_storel_pi((__m64 *)(ptr1+4),_tmp3);                                   \
     _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3);                                   \
}
427
428
/* Add per-pointer values to three atoms at each of two j-particle pointers:
 * lane 0 of each j-register is added to ptr1[0..8] and lane 1 to ptr2[0..8]
 * (read-modify-write).  Clobbers jx1, jz1, jy2 and jx3, which are repacked
 * in-place; the z3 pair is handled with scalar add_ss stores.
 */
#define GMX_MM_UPDATE_2JCOORD_3ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;         \
     _tmp1          = _mm_loadu_ps(ptr1);                                      \
     _tmp2          = _mm_loadu_ps(ptr1+4);                                    \
     _tmp3          = _mm_load_ss(ptr1+8);                                     \
     _tmp4          = _mm_loadu_ps(ptr2);                                      \
     _tmp5          = _mm_loadu_ps(ptr2+4);                                    \
     _tmp6          = _mm_load_ss(ptr2+8);                                     \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
     jx3            = _mm_unpacklo_ps(jx3,jy3);                                \
     _tmp7          = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));            \
     _tmp8          = _mm_movelh_ps(jx1,jz1);                                  \
     _tmp9          = _mm_movehl_ps(jz1,jx1);                                  \
     _tmp10         = _mm_movelh_ps(jy2,jx3);                                  \
     _tmp11         = _mm_movehl_ps(jx3,jy2);                                  \
     _tmp1          = _mm_add_ps(_tmp1,_tmp8);                                 \
     _tmp2          = _mm_add_ps(_tmp2,_tmp10);                                \
     _tmp3          = _mm_add_ss(_tmp3,jz3);                                   \
     _tmp4          = _mm_add_ps(_tmp4,_tmp9);                                 \
     _tmp5          = _mm_add_ps(_tmp5,_tmp11);                                \
     _tmp6          = _mm_add_ss(_tmp6,_tmp7);                                 \
     _mm_storeu_ps(ptr1,_tmp1);                                                \
     _mm_storeu_ps(ptr1+4,_tmp2);                                              \
     _mm_store_ss(ptr1+8,_tmp3);                                               \
     _mm_storeu_ps(ptr2,_tmp4);                                                \
     _mm_storeu_ps(ptr2+4,_tmp5);                                              \
     _mm_store_ss(ptr2+8,_tmp6);                                               \
}
459
460
/* Increment (read-modify-write) the 12 packed floats (4 atoms x xyz) of two
 * j-particle arrays. SSE lane 0 of every jx/jy/jz register belongs to ptr1
 * and lane 1 to ptr2; lanes 2 and 3 are ignored. The unpacklo/movelh/movehl
 * sequence transposes the lane-major registers back into the xyz-major
 * memory layout before the unaligned loads/adds/stores.
 * (Cleanup: the previously declared _tmp7 was never used and is removed.)
 */
#define GMX_MM_UPDATE_2JCOORD_4ATOMS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
     _tmp1          = _mm_loadu_ps(ptr1);                                       \
     _tmp2          = _mm_loadu_ps(ptr1+4);                                     \
     _tmp3          = _mm_loadu_ps(ptr1+8);                                     \
     _tmp4          = _mm_loadu_ps(ptr2);                                       \
     _tmp5          = _mm_loadu_ps(ptr2+4);                                     \
     _tmp6          = _mm_loadu_ps(ptr2+8);                                     \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                                 \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                                 \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                                 \
     jx3            = _mm_unpacklo_ps(jx3,jy3);                                 \
     jz3            = _mm_unpacklo_ps(jz3,jx4);                                 \
     jy4            = _mm_unpacklo_ps(jy4,jz4);                                 \
     _tmp8          = _mm_movelh_ps(jx1,jz1);                                   \
     _tmp9          = _mm_movehl_ps(jz1,jx1);                                   \
     _tmp10         = _mm_movelh_ps(jy2,jx3);                                   \
     _tmp11         = _mm_movehl_ps(jx3,jy2);                                   \
     _tmp12         = _mm_movelh_ps(jz3,jy4);                                   \
     _tmp13         = _mm_movehl_ps(jy4,jz3);                                   \
     _tmp1          = _mm_add_ps(_tmp1,_tmp8);                                  \
     _tmp2          = _mm_add_ps(_tmp2,_tmp10);                                 \
     _tmp3          = _mm_add_ps(_tmp3,_tmp12);                                 \
     _tmp4          = _mm_add_ps(_tmp4,_tmp9);                                  \
     _tmp5          = _mm_add_ps(_tmp5,_tmp11);                                 \
     _tmp6          = _mm_add_ps(_tmp6,_tmp13);                                 \
     _mm_storeu_ps(ptr1,_tmp1);                                                 \
     _mm_storeu_ps(ptr1+4,_tmp2);                                               \
     _mm_storeu_ps(ptr1+8,_tmp3);                                               \
     _mm_storeu_ps(ptr2,_tmp4);                                                 \
     _mm_storeu_ps(ptr2+4,_tmp5);                                               \
     _mm_storeu_ps(ptr2+8,_tmp6);                                               \
}
494
495
/* Increment the xyz triplet of one atom in each of three j-particle arrays.
 * SSE lane k of jx1/jy1/jz1 belongs to ptr(k+1); lane 3 is ignored.
 * Memory is touched as one scalar float plus a 64-bit half-register load
 * and store at ptr+1, so exactly three floats are read and written per
 * pointer; element 1 of each working register is never stored.
 */
#define GMX_MM_UPDATE_3JCOORD_1ATOM_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7;                \
     _tmp1          = _mm_load_ss(ptr1);                              \
     _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));          \
     _tmp2          = _mm_load_ss(ptr2);                              \
     _tmp2          = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1));          \
     _tmp3          = _mm_load_ss(ptr3);                              \
     _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1));          \
     _tmp4          = _mm_unpacklo_ps(jy1,jz1);                       \
     _tmp5          = _mm_unpackhi_ps(jy1,jz1);                       \
     /* Build [x_k, -, y_k, z_k] per target pointer k */              \
     _tmp6          = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1)); \
     _tmp7          = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2));   \
     jx1            = _mm_movelh_ps(jx1,_tmp4);                       \
     _tmp7          = _mm_movelh_ps(_tmp7,_tmp5);                     \
     _tmp1          = _mm_add_ps(_tmp1,jx1);                          \
     _tmp2          = _mm_add_ps(_tmp2,_tmp6);                        \
     _tmp3          = _mm_add_ps(_tmp3,_tmp7);                        \
     _mm_store_ss(ptr1,_tmp1);                                        \
     _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                          \
     _mm_store_ss(ptr2,_tmp2);                                        \
     _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2);                          \
     _mm_store_ss(ptr3,_tmp3);                                        \
     _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3);                          \
}
520
521
/* Increment the 6 packed floats (2 atoms x xyz) of three j-particle arrays.
 * SSE lane 0 of each jx/jy/jz register belongs to ptr1, lane 1 to ptr2,
 * lane 2 to ptr3; lane 3 is ignored. The unpack/movelh/movehl sequence
 * transposes the lane-major registers into the xyz-major memory layout.
 * Fix: the _mm_loadl_pi merge bases were previously the uninitialized
 * _tmp4/_tmp5 registers; use _mm_setzero_ps() instead (as the 4JCOORD
 * variant already does). The stored lanes are unaffected, since the
 * zeroed halves are either overwritten by _mm_loadh_pi or never stored.
 */
#define GMX_MM_UPDATE_3JCOORD_2ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;      \
     _tmp1          = _mm_loadu_ps(ptr1);                                      \
     _tmp2          = _mm_loadu_ps(ptr2);                                      \
     _tmp3          = _mm_loadu_ps(ptr3);                                      \
     _tmp4          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));        \
     _tmp4          = _mm_loadh_pi(_tmp4,(__m64 *)(ptr2+4));                   \
     _tmp5          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));        \
     _tmp6          = _mm_unpackhi_ps(jx1,jy1);                                \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
     _tmp7          = _mm_unpackhi_ps(jz1,jx2);                                \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
     _tmp8          = _mm_unpackhi_ps(jy2,jz2);                                \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
     _tmp9          = _mm_movelh_ps(jx1,jz1);                                  \
     _tmp10         = _mm_movehl_ps(jz1,jx1);                                  \
     _tmp6          = _mm_movelh_ps(_tmp6,_tmp7);                              \
     _tmp1          = _mm_add_ps(_tmp1,_tmp9);                                 \
     _tmp2          = _mm_add_ps(_tmp2,_tmp10);                                \
     _tmp3          = _mm_add_ps(_tmp3,_tmp6);                                 \
     _tmp4          = _mm_add_ps(_tmp4,jy2);                                   \
     _tmp5          = _mm_add_ps(_tmp5,_tmp8);                                 \
     _mm_storeu_ps(ptr1,_tmp1);                                                \
     _mm_storeu_ps(ptr2,_tmp2);                                                \
     _mm_storeu_ps(ptr3,_tmp3);                                                \
     _mm_storel_pi((__m64 *)(ptr1+4),_tmp4);                                   \
     _mm_storeh_pi((__m64 *)(ptr2+4),_tmp4);                                   \
     _mm_storel_pi((__m64 *)(ptr3+4),_tmp5);                                   \
}
551
552
/* Increment the 9 packed floats (3 atoms x xyz) of three j-particle arrays.
 * SSE lane k of each jx/jy/jz register belongs to ptr(k+1); lane 3 is
 * ignored. Each array is updated as two unaligned 4-float vectors plus a
 * single trailing scalar; _tmp14/_tmp15 rotate jz3 so that its lanes 1 and
 * 2 sit in element 0 for the scalar adds/stores of ptr2 and ptr3.
 */
#define GMX_MM_UPDATE_3JCOORD_3ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;      \
     __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19;    \
     _tmp1          = _mm_loadu_ps(ptr1);                                      \
     _tmp2          = _mm_loadu_ps(ptr1+4);                                    \
     _tmp3          = _mm_load_ss(ptr1+8);                                     \
     _tmp4          = _mm_loadu_ps(ptr2);                                      \
     _tmp5          = _mm_loadu_ps(ptr2+4);                                    \
     _tmp6          = _mm_load_ss(ptr2+8);                                     \
     _tmp7          = _mm_loadu_ps(ptr3);                                      \
     _tmp8          = _mm_loadu_ps(ptr3+4);                                    \
     _tmp9          = _mm_load_ss(ptr3+8);                                     \
     _tmp10         = _mm_unpackhi_ps(jx1,jy1);                                \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
     _tmp11         = _mm_unpackhi_ps(jz1,jx2);                                \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
     _tmp12         = _mm_unpackhi_ps(jy2,jz2);                                \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
     _tmp13         = _mm_unpackhi_ps(jx3,jy3);                                \
     jx3            = _mm_unpacklo_ps(jx3,jy3);                                \
     _tmp14         = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));            \
     _tmp15         = _mm_movehl_ps(jz3,jz3);                                  \
     _tmp16         = _mm_movelh_ps(jx1,jz1);                                  \
     _tmp17         = _mm_movehl_ps(jz1,jx1);                                  \
     _tmp10         = _mm_movelh_ps(_tmp10,_tmp11);                            \
     _tmp18         = _mm_movelh_ps(jy2,jx3);                                  \
     _tmp19         = _mm_movehl_ps(jx3,jy2);                                  \
     _tmp12         = _mm_movelh_ps(_tmp12,_tmp13);                            \
     _tmp1          = _mm_add_ps(_tmp1,_tmp16);                                \
     _tmp2          = _mm_add_ps(_tmp2,_tmp18);                                \
     _tmp3          = _mm_add_ss(_tmp3,jz3);                                   \
     _tmp4          = _mm_add_ps(_tmp4,_tmp17);                                \
     _tmp5          = _mm_add_ps(_tmp5,_tmp19);                                \
     _tmp6          = _mm_add_ss(_tmp6,_tmp14);                                \
     _tmp7          = _mm_add_ps(_tmp7,_tmp10);                                \
     _tmp8          = _mm_add_ps(_tmp8,_tmp12);                                \
     _tmp9          = _mm_add_ss(_tmp9,_tmp15);                                \
     _mm_storeu_ps(ptr1,_tmp1);                                                \
     _mm_storeu_ps(ptr1+4,_tmp2);                                              \
     _mm_store_ss(ptr1+8,_tmp3);                                               \
     _mm_storeu_ps(ptr2,_tmp4);                                                \
     _mm_storeu_ps(ptr2+4,_tmp5);                                              \
     _mm_store_ss(ptr2+8,_tmp6);                                               \
     _mm_storeu_ps(ptr3,_tmp7);                                                \
     _mm_storeu_ps(ptr3+4,_tmp8);                                              \
     _mm_store_ss(ptr3+8,_tmp9);                                               \
}
600
601
/* Increment the 12 packed floats (4 atoms x xyz) of three j-particle
 * arrays. SSE lane k of each jx/jy/jz register belongs to ptr(k+1);
 * lane 3 is ignored. Each array is read, updated, and written back as
 * three unaligned 4-float vectors after the unpack/movelh/movehl
 * transpose from lane-major registers to the xyz-major memory layout.
 */
#define GMX_MM_UPDATE_3JCOORD_4ATOMS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;         \
     __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21;      \
     _tmp1          = _mm_loadu_ps(ptr1);                                      \
     _tmp2          = _mm_loadu_ps(ptr1+4);                                    \
     _tmp3          = _mm_loadu_ps(ptr1+8);                                    \
     _tmp4          = _mm_loadu_ps(ptr2);                                      \
     _tmp5          = _mm_loadu_ps(ptr2+4);                                    \
     _tmp6          = _mm_loadu_ps(ptr2+8);                                    \
     _tmp7          = _mm_loadu_ps(ptr3);                                      \
     _tmp8          = _mm_loadu_ps(ptr3+4);                                    \
     _tmp9          = _mm_loadu_ps(ptr3+8);                                    \
     _tmp10         = _mm_unpackhi_ps(jx1,jy1);                                \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
     _tmp11         = _mm_unpackhi_ps(jz1,jx2);                                \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
     _tmp12         = _mm_unpackhi_ps(jy2,jz2);                                \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
     _tmp13         = _mm_unpackhi_ps(jx3,jy3);                                \
     jx3            = _mm_unpacklo_ps(jx3,jy3);                                \
     _tmp14         = _mm_unpackhi_ps(jz3,jx4);                                \
     jz3            = _mm_unpacklo_ps(jz3,jx4);                                \
     _tmp15         = _mm_unpackhi_ps(jy4,jz4);                                \
     jy4            = _mm_unpacklo_ps(jy4,jz4);                                \
     _tmp16         = _mm_movelh_ps(jx1,jz1);                                  \
     _tmp17         = _mm_movehl_ps(jz1,jx1);                                  \
     _tmp10         = _mm_movelh_ps(_tmp10,_tmp11);                            \
     _tmp18         = _mm_movelh_ps(jy2,jx3);                                  \
     _tmp19         = _mm_movehl_ps(jx3,jy2);                                  \
     _tmp12         = _mm_movelh_ps(_tmp12,_tmp13);                            \
     _tmp20         = _mm_movelh_ps(jz3,jy4);                                  \
     _tmp21         = _mm_movehl_ps(jy4,jz3);                                  \
     _tmp14         = _mm_movelh_ps(_tmp14,_tmp15);                            \
     _tmp1          = _mm_add_ps(_tmp1,_tmp16);                                \
     _tmp2          = _mm_add_ps(_tmp2,_tmp18);                                \
     _tmp3          = _mm_add_ps(_tmp3,_tmp20);                                \
     _tmp4          = _mm_add_ps(_tmp4,_tmp17);                                \
     _tmp5          = _mm_add_ps(_tmp5,_tmp19);                                \
     _tmp6          = _mm_add_ps(_tmp6,_tmp21);                                \
     _tmp7          = _mm_add_ps(_tmp7,_tmp10);                                \
     _tmp8          = _mm_add_ps(_tmp8,_tmp12);                                \
     _tmp9          = _mm_add_ps(_tmp9,_tmp14);                                \
     _mm_storeu_ps(ptr1,_tmp1);                                                \
     _mm_storeu_ps(ptr1+4,_tmp2);                                              \
     _mm_storeu_ps(ptr1+8,_tmp3);                                              \
     _mm_storeu_ps(ptr2,_tmp4);                                                \
     _mm_storeu_ps(ptr2+4,_tmp5);                                              \
     _mm_storeu_ps(ptr2+8,_tmp6);                                              \
     _mm_storeu_ps(ptr3,_tmp7);                                                \
     _mm_storeu_ps(ptr3+4,_tmp8);                                              \
     _mm_storeu_ps(ptr3+8,_tmp9);                                              \
}
654
655
656
/* Increment the xyz triplet of one atom in each of four j-particle arrays.
 * SSE lane k of jx1/jy1/jz1 belongs to ptr(k+1). Memory is touched as one
 * scalar float plus a 64-bit half-register load and store at ptr+1, so
 * exactly three floats are read and written per pointer; element 1 of each
 * working register is never stored.
 */
#define GMX_MM_UPDATE_4JCOORD_1ATOM_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;      \
     _tmp1          = _mm_load_ss(ptr1);                              \
     _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));          \
     _tmp2          = _mm_load_ss(ptr2);                              \
     _tmp2          = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1));          \
     _tmp3          = _mm_load_ss(ptr3);                              \
     _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1));          \
     _tmp4          = _mm_load_ss(ptr4);                              \
     _tmp4          = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1));          \
     _tmp5          = _mm_unpacklo_ps(jy1,jz1);                       \
     _tmp6          = _mm_unpackhi_ps(jy1,jz1);                       \
     /* _tmp7.._tmp10 are [x_k, -, y_k, z_k] for k = 0..3 */          \
     _tmp7          = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
     _tmp8          = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
     _tmp9          = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
     _tmp10         = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
     _tmp1          = _mm_add_ps(_tmp1,_tmp7);                        \
     _tmp2          = _mm_add_ps(_tmp2,_tmp8);                        \
     _tmp3          = _mm_add_ps(_tmp3,_tmp9);                        \
     _tmp4          = _mm_add_ps(_tmp4,_tmp10);                       \
     _mm_store_ss(ptr1,_tmp1);                                        \
     _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                          \
     _mm_store_ss(ptr2,_tmp2);                                        \
     _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2);                          \
     _mm_store_ss(ptr3,_tmp3);                                        \
     _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3);                          \
     _mm_store_ss(ptr4,_tmp4);                                        \
     _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4);                          \
}
686
687
/* Increment the 6 packed floats (2 atoms x xyz) of four j-particle arrays.
 * SSE lane k of each jx/jy/jz register belongs to ptr(k+1). The first four
 * floats of each array are handled as one unaligned vector; the trailing
 * two floats are handled through 64-bit half-register loads/stores packed
 * two arrays per register.
 */
#define GMX_MM_UPDATE_4JCOORD_2ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13;   \
     _tmp1          = _mm_loadu_ps(ptr1);                                       \
     _tmp2          = _mm_loadu_ps(ptr2);                                       \
     _tmp3          = _mm_loadu_ps(ptr3);                                       \
     _tmp4          = _mm_loadu_ps(ptr4);                                       \
     /* zeroed merge base: avoids reading an uninitialized register */          \
     _tmp5          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));         \
     _tmp5          = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4));                    \
     _tmp6          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));         \
     _tmp6          = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4));                    \
     _tmp7          = _mm_unpackhi_ps(jx1,jy1);                                 \
         jx1            = _mm_unpacklo_ps(jx1,jy1);                                 \
     _tmp8          = _mm_unpackhi_ps(jz1,jx2);                                 \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                                 \
     _tmp9          = _mm_unpackhi_ps(jy2,jz2);                                 \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                                 \
     _tmp10         = _mm_movelh_ps(jx1,jz1);                                   \
     _tmp11         = _mm_movehl_ps(jz1,jx1);                                   \
     _tmp12         = _mm_movelh_ps(_tmp7,_tmp8);                               \
     _tmp13         = _mm_movehl_ps(_tmp8,_tmp7);                               \
     _tmp1          = _mm_add_ps(_tmp1,_tmp10);                                 \
     _tmp2          = _mm_add_ps(_tmp2,_tmp11);                                 \
     _tmp3          = _mm_add_ps(_tmp3,_tmp12);                                 \
     _tmp4          = _mm_add_ps(_tmp4,_tmp13);                                 \
     _tmp5          = _mm_add_ps(_tmp5,jy2);                                    \
     _tmp6          = _mm_add_ps(_tmp6,_tmp9);                                  \
     _mm_storeu_ps(ptr1,_tmp1);                                                 \
     _mm_storeu_ps(ptr2,_tmp2);                                                 \
     _mm_storeu_ps(ptr3,_tmp3);                                                 \
     _mm_storeu_ps(ptr4,_tmp4);                                                 \
     _mm_storel_pi((__m64 *)(ptr1+4),_tmp5);                                    \
     _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5);                                    \
         _mm_storel_pi((__m64 *)(ptr3+4),_tmp6);                                    \
         _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6);                                    \
}
723
724
/* Increment the 9 packed floats (3 atoms x xyz) of four j-particle arrays.
 * SSE lane k of each jx/jy/jz register belongs to ptr(k+1). Each array is
 * updated as two unaligned 4-float vectors plus a single trailing scalar;
 * _tmp17/_tmp18/_tmp19 rotate jz3 so that its lanes 1, 2, and 3 sit in
 * element 0 for the scalar adds/stores of ptr2, ptr3, and ptr4.
 */
#define GMX_MM_UPDATE_4JCOORD_3ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;       \
     __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19;     \
     __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25;                          \
     _tmp1          = _mm_loadu_ps(ptr1);                                       \
     _tmp2          = _mm_loadu_ps(ptr1+4);                                     \
     _tmp3          = _mm_load_ss(ptr1+8);                                      \
     _tmp4          = _mm_loadu_ps(ptr2);                                       \
     _tmp5          = _mm_loadu_ps(ptr2+4);                                     \
     _tmp6          = _mm_load_ss(ptr2+8);                                      \
     _tmp7          = _mm_loadu_ps(ptr3);                                       \
     _tmp8          = _mm_loadu_ps(ptr3+4);                                     \
     _tmp9          = _mm_load_ss(ptr3+8);                                      \
     _tmp10         = _mm_loadu_ps(ptr4);                                       \
     _tmp11         = _mm_loadu_ps(ptr4+4);                                     \
     _tmp12         = _mm_load_ss(ptr4+8);                                      \
     _tmp13         = _mm_unpackhi_ps(jx1,jy1);                                 \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                                 \
     _tmp14         = _mm_unpackhi_ps(jz1,jx2);                                 \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                                 \
     _tmp15         = _mm_unpackhi_ps(jy2,jz2);                                 \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                                 \
     _tmp16         = _mm_unpackhi_ps(jx3,jy3);                                 \
     jx3            = _mm_unpacklo_ps(jx3,jy3);                                 \
     _tmp17         = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));             \
     _tmp18         = _mm_movehl_ps(jz3,jz3);                                   \
     _tmp19         = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1));       \
     _tmp20         = _mm_movelh_ps(jx1,jz1);                                   \
     _tmp21         = _mm_movehl_ps(jz1,jx1);                                   \
     _tmp22         = _mm_movelh_ps(_tmp13,_tmp14);                             \
     _tmp14         = _mm_movehl_ps(_tmp14,_tmp13);                             \
     _tmp23         = _mm_movelh_ps(jy2,jx3);                                   \
     _tmp24         = _mm_movehl_ps(jx3,jy2);                                   \
     _tmp25         = _mm_movelh_ps(_tmp15,_tmp16);                             \
     _tmp16         = _mm_movehl_ps(_tmp16,_tmp15);                             \
     _tmp1          = _mm_add_ps(_tmp1,_tmp20);                                 \
     _tmp2          = _mm_add_ps(_tmp2,_tmp23);                                 \
     _tmp3          = _mm_add_ss(_tmp3,jz3);                                    \
     _tmp4          = _mm_add_ps(_tmp4,_tmp21);                                 \
     _tmp5          = _mm_add_ps(_tmp5,_tmp24);                                 \
     _tmp6          = _mm_add_ss(_tmp6,_tmp17);                                 \
     _tmp7          = _mm_add_ps(_tmp7,_tmp22);                                 \
     _tmp8          = _mm_add_ps(_tmp8,_tmp25);                                 \
     _tmp9          = _mm_add_ss(_tmp9,_tmp18);                                 \
     _tmp10         = _mm_add_ps(_tmp10,_tmp14);                                \
     _tmp11         = _mm_add_ps(_tmp11,_tmp16);                                \
     _tmp12         = _mm_add_ss(_tmp12,_tmp19);                                \
     _mm_storeu_ps(ptr1,_tmp1);                                                 \
     _mm_storeu_ps(ptr1+4,_tmp2);                                               \
     _mm_store_ss(ptr1+8,_tmp3);                                                \
     _mm_storeu_ps(ptr2,_tmp4);                                                 \
     _mm_storeu_ps(ptr2+4,_tmp5);                                               \
     _mm_store_ss(ptr2+8,_tmp6);                                                \
     _mm_storeu_ps(ptr3,_tmp7);                                                 \
     _mm_storeu_ps(ptr3+4,_tmp8);                                               \
     _mm_store_ss(ptr3+8,_tmp9);                                                \
     _mm_storeu_ps(ptr4,_tmp10);                                                \
     _mm_storeu_ps(ptr4+4,_tmp11);                                              \
     _mm_store_ss(ptr4+8,_tmp12);                                               \
}
785
786
/* Increment the 12 packed floats (4 atoms x xyz) of four j-particle
 * arrays. SSE lane k of each jx/jy/jz register belongs to ptr(k+1).
 * This is a full 4x4 transpose per atom group; to economize on
 * temporaries the no-longer-needed jz1/jx3/jy4 registers are recycled
 * to hold the ptr2 rows after the movehl step.
 */
#define GMX_MM_UPDATE_4JCOORD_4ATOMS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;         \
     __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22;\
     __m128 _tmp23,_tmp24;                                                     \
     _tmp1          = _mm_loadu_ps(ptr1);                                      \
     _tmp2          = _mm_loadu_ps(ptr1+4);                                    \
     _tmp3          = _mm_loadu_ps(ptr1+8);                                    \
     _tmp4          = _mm_loadu_ps(ptr2);                                      \
     _tmp5          = _mm_loadu_ps(ptr2+4);                                    \
     _tmp6          = _mm_loadu_ps(ptr2+8);                                    \
     _tmp7          = _mm_loadu_ps(ptr3);                                      \
     _tmp8          = _mm_loadu_ps(ptr3+4);                                    \
     _tmp9          = _mm_loadu_ps(ptr3+8);                                    \
     _tmp10         = _mm_loadu_ps(ptr4);                                      \
     _tmp11         = _mm_loadu_ps(ptr4+4);                                    \
     _tmp12         = _mm_loadu_ps(ptr4+8);                                    \
     _tmp13         = _mm_unpackhi_ps(jx1,jy1);                                \
     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
     _tmp14         = _mm_unpackhi_ps(jz1,jx2);                                \
     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
     _tmp15         = _mm_unpackhi_ps(jy2,jz2);                                \
     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
     _tmp16         = _mm_unpackhi_ps(jx3,jy3);                                \
     jx3            = _mm_unpacklo_ps(jx3,jy3);                                \
     _tmp17         = _mm_unpackhi_ps(jz3,jx4);                                \
     jz3            = _mm_unpacklo_ps(jz3,jx4);                                \
     _tmp18         = _mm_unpackhi_ps(jy4,jz4);                                \
     jy4            = _mm_unpacklo_ps(jy4,jz4);                                \
     _tmp19         = _mm_movelh_ps(jx1,jz1);                                  \
     jz1            = _mm_movehl_ps(jz1,jx1);                                  \
     _tmp20         = _mm_movelh_ps(_tmp13,_tmp14);                            \
     _tmp14         = _mm_movehl_ps(_tmp14,_tmp13);                            \
     _tmp21         = _mm_movelh_ps(jy2,jx3);                                  \
     jx3            = _mm_movehl_ps(jx3,jy2);                                  \
     _tmp22         = _mm_movelh_ps(_tmp15,_tmp16);                            \
     _tmp16         = _mm_movehl_ps(_tmp16,_tmp15);                            \
     _tmp23         = _mm_movelh_ps(jz3,jy4);                                  \
     jy4            = _mm_movehl_ps(jy4,jz3);                                  \
     _tmp24         = _mm_movelh_ps(_tmp17,_tmp18);                            \
     _tmp18         = _mm_movehl_ps(_tmp18,_tmp17);                            \
     _tmp1          = _mm_add_ps(_tmp1,_tmp19);                                \
     _tmp2          = _mm_add_ps(_tmp2,_tmp21);                                \
     _tmp3          = _mm_add_ps(_tmp3,_tmp23);                                \
     _tmp4          = _mm_add_ps(_tmp4,jz1);                                   \
     _tmp5          = _mm_add_ps(_tmp5,jx3);                                   \
     _tmp6          = _mm_add_ps(_tmp6,jy4);                                   \
     _tmp7          = _mm_add_ps(_tmp7,_tmp20);                                \
     _tmp8          = _mm_add_ps(_tmp8,_tmp22);                                \
     _tmp9          = _mm_add_ps(_tmp9,_tmp24);                                \
     _tmp10         = _mm_add_ps(_tmp10,_tmp14);                               \
     _tmp11         = _mm_add_ps(_tmp11,_tmp16);                               \
     _tmp12         = _mm_add_ps(_tmp12,_tmp18);                               \
     _mm_storeu_ps(ptr1,_tmp1);                                                \
     _mm_storeu_ps(ptr1+4,_tmp2);                                              \
     _mm_storeu_ps(ptr1+8,_tmp3);                                              \
     _mm_storeu_ps(ptr2,_tmp4);                                                \
     _mm_storeu_ps(ptr2+4,_tmp5);                                              \
     _mm_storeu_ps(ptr2+8,_tmp6);                                              \
     _mm_storeu_ps(ptr3,_tmp7);                                                \
     _mm_storeu_ps(ptr3+4,_tmp8);                                              \
     _mm_storeu_ps(ptr3+8,_tmp9);                                              \
     _mm_storeu_ps(ptr4,_tmp10);                                               \
     _mm_storeu_ps(ptr4+4,_tmp11);                                             \
     _mm_storeu_ps(ptr4+8,_tmp12);                                             \
}
852
853
854
/* Per-lane squared length: returns x*x + y*y + z*z in every SIMD lane.
 * Despite the name there is only one vector argument; this is the dot
 * product of (x,y,z) with itself, e.g. to obtain r^2 from distance
 * components.
 */
static inline __m128
gmx_mm_scalarprod_ps(__m128 x, __m128 y, __m128 z)
{
    __m128 xx = _mm_mul_ps(x,x);
    __m128 yy = _mm_mul_ps(y,y);
    __m128 zz = _mm_mul_ps(z,z);

    /* left-to-right association: (xx + yy) + zz */
    return _mm_add_ps(_mm_add_ps(xx,yy),zz);
}
860
861
/* Approximate 1/x for all four lanes: the hardware rcpps estimate
 * (about 12 bits of precision) refined with one Newton-Raphson
 * iteration, lu*(2 - x*lu), which roughly doubles the accurate bits.
 */
static inline __m128
gmx_mm_recip_ps(__m128 x)
{
    const __m128 two = _mm_set1_ps(2.0f);
    __m128       est = _mm_rcp_ps(x);

    return _mm_mul_ps(est,_mm_sub_ps(two,_mm_mul_ps(x,est)));
}
870
871
872
/* Approximate 1/sqrt(x) for all four lanes: the hardware rsqrtps
 * estimate refined with one Newton-Raphson iteration,
 * 0.5*lu*(3 - x*lu*lu), which roughly doubles the accurate bits.
 */
static inline __m128
gmx_mm_invsqrt_ps(__m128 x)
{
    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);

    __m128 est  = _mm_rsqrt_ps(x);
    __m128 corr = _mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(est,est),x));

    return _mm_mul_ps(half,_mm_mul_ps(corr,est));
}
883
884
/* Routine to be called with rswitch/rcut at the beginning of a kernel
 * to set up the six constants used for analytic 5th order switch calculations.
 */
888 static inline void
889 gmx_mm_setup_switch5_constants_ps(__m128 rswitch, __m128 rcut,
890                                                                   __m128 *switch_C3, __m128 *switch_C4, __m128 *switch_C5,
891                                                                   __m128 *switch_D2, __m128 *switch_D3, __m128 *switch_D4)
892 {
893         const __m128  cm6  = { -6.0, -6.0, -6.0, -6.0};
894         const __m128 cm10  = {-10.0,-10.0,-10.0,-10.0};
895         const __m128  c15  = { 15.0, 15.0, 15.0, 15.0};
896         const __m128 cm30  = {-30.0,-30.0,-30.0,-30.0};
897         const __m128  c60  = { 60.0, 60.0, 60.0, 60.0};
898
899         __m128 d,dinv,dinv2,dinv3,dinv4,dinv5;
900         
901         d       = _mm_sub_ps(rcut,rswitch);
902         dinv    = gmx_mm_recip_ps(d);
903         dinv2   = _mm_mul_ps(dinv,dinv);
904         dinv3   = _mm_mul_ps(dinv2,dinv);
905         dinv4   = _mm_mul_ps(dinv2,dinv2);
906         dinv5   = _mm_mul_ps(dinv3,dinv2);
907         
908         *switch_C3 = _mm_mul_ps(cm10,dinv3);
909         *switch_C4 = _mm_mul_ps(c15,dinv4);
910         *switch_C5 = _mm_mul_ps(cm6,dinv5);
911         *switch_D2 = _mm_mul_ps(cm30,dinv3);
912         *switch_D3 = _mm_mul_ps(c60,dinv4);
913         *switch_D4 = _mm_mul_ps(cm30,dinv5);
914 }
915
916
/* Evaluate the 5th-order switch function sw and its derivative factor dsw at
 * distance r (clamped to [rswitch,rcut]) from the constants produced by
 * gmx_mm_setup_switch5_constants_ps():
 *   sw  = 1 + d^3*(C3 + C4*d + C5*d^2),   d = clamp(r) - rswitch
 *   dsw =     d^2*(D2 + D3*d + D4*d^2)
 * The macro-local temporaries are underscore-prefixed (matching the _tmp*
 * convention of the other macros in this file); the previous plain names
 * d/d2 would silently capture a caller argument that happened to be named
 * d or d2, yielding an uninitialized read instead of the distance.
 */
#define GMX_MM_SET_SWITCH5_PS(r,rswitch,rcut,sw,dsw,sw_C3,sw_C4,sw_C5,sw_D2,sw_D3,sw_D4) {   \
    const __m128  _sw_one  = {  1.0,  1.0,  1.0,  1.0};                                      \
    __m128 _sw_d,_sw_d2;                                                                     \
    _sw_d  = _mm_max_ps(r,rswitch);                                                          \
    _sw_d  = _mm_min_ps(_sw_d,rcut);                                                         \
    _sw_d  = _mm_sub_ps(_sw_d,rswitch);                                                      \
    _sw_d2 = _mm_mul_ps(_sw_d,_sw_d);                                                        \
    sw     = _mm_mul_ps(_sw_d,sw_C5);                                                        \
    dsw    = _mm_mul_ps(_sw_d,sw_D4);                                                        \
    sw     = _mm_add_ps(sw,sw_C4);                                                           \
    dsw    = _mm_add_ps(dsw,sw_D3);                                                          \
    sw     = _mm_mul_ps(sw,_sw_d);                                                           \
    dsw    = _mm_mul_ps(dsw,_sw_d);                                                          \
    sw     = _mm_add_ps(sw,sw_C3);                                                           \
    dsw    = _mm_add_ps(dsw,sw_D2);                                                          \
    sw     = _mm_mul_ps(sw,_mm_mul_ps(_sw_d,_sw_d2));                                        \
    dsw    = _mm_mul_ps(dsw,_sw_d2);                                                         \
    sw     = _mm_add_ps(sw,_sw_one);                                                         \
}
936
937
/* Plain coulomb interaction V = qq/r for four pairs. The energy is
 * accumulated into *vctot. Returns fscaltmp, multiply with rinvsq to get
 * fscal!
 */
static inline __m128
gmx_mm_int_coulomb_ps(__m128 rinv, __m128 qq,__m128 *vctot)
{
    __m128 energy;

    energy = _mm_mul_ps(qq,rinv);
    *vctot = _mm_add_ps(*vctot,energy);

    return energy;
}
946
947
/* Energy-only version of the plain coulomb interaction: accumulates
 * qq/r into *vctot and returns nothing.
 */
static inline void
gmx_mm_int_coulomb_noforce_ps(__m128 rinv, __m128 qq,__m128 *vctot)
{
    *vctot = _mm_add_ps(*vctot,_mm_mul_ps(qq,rinv));
}
955
/* Reaction-field coulomb interaction V = qq*(1/r + krf*r^2 - crf) for four
 * pairs; the energy is accumulated into *vctot. Returns fscaltmp, multiply
 * with rinvsq to get fscal!
 */
static inline __m128
gmx_mm_int_coulombrf_ps(const __m128 rinv, const __m128 rsq, const __m128 krf, const __m128 crf, const __m128 qq,__m128 *vctot)
{
    const __m128 two  = {2.0,2.0,2.0,2.0};

    __m128 krsq,energy;

    krsq   = _mm_mul_ps(krf,rsq);
    energy = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv,krsq),crf));
    *vctot = _mm_add_ps(*vctot,energy);

    /* force part: qq*(1/r - 2*krf*r^2) */
    return _mm_mul_ps(qq, _mm_sub_ps(rinv, _mm_mul_ps(two,krsq)));
}
969
970
/* Energy-only version of the reaction-field coulomb interaction:
 * accumulates qq*(1/r + krf*r^2 - crf) into *vctot.
 */
static inline void
gmx_mm_int_coulombrf_noforce_ps(__m128 rinv, __m128 rsq, __m128 krf, __m128 crf, __m128 qq,__m128 *vctot)
{
    __m128 krsq    = _mm_mul_ps(krf,rsq);
    __m128 shifted = _mm_sub_ps(_mm_add_ps(rinv,krsq),crf);

    *vctot = _mm_add_ps(*vctot,_mm_mul_ps(qq,shifted));
}
981
982
/* GB (Generalized Born) helpers: section placeholder, none implemented in this file */




/* GB + RF helpers: section placeholder, none implemented in this file */

990
/* Lennard-Jones interaction V = c12/r^12 - c6/r^6 for four pairs, computed
 * from rinvsq = 1/r^2. The energy is accumulated into *vvdwtot. Returns
 * fscaltmp = 12*Vrep - 6*Vdisp; multiply with rinvsq to get fscal!
 */
static inline __m128
gmx_mm_int_lj_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    const __m128 six    = {6.0,6.0,6.0,6.0};
    const __m128 twelve = {12.0,12.0,12.0,12.0};

    __m128 rinv6,vdisp,vrep;

    rinv6    = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
    vdisp    = _mm_mul_ps(c6,rinv6);
    vrep     = _mm_mul_ps(c12, _mm_mul_ps(rinv6,rinv6));
    *vvdwtot = _mm_add_ps(*vvdwtot , _mm_sub_ps(vrep,vdisp));

    return _mm_sub_ps( _mm_mul_ps(twelve,vrep),_mm_mul_ps(six,vdisp));
}
1007                    
1008
/* Energy-only Lennard-Jones interaction: accumulates c12/r^12 - c6/r^6
 * into *vvdwtot, computed from rinvsq = 1/r^2.
 */
static inline void
gmx_mm_int_lj_potonly_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128 rinv6 = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
    __m128 vdisp = _mm_mul_ps(c6,rinv6);
    __m128 vrep  = _mm_mul_ps(c12, _mm_mul_ps(rinv6,rinv6));

    *vvdwtot = _mm_add_ps(*vvdwtot , _mm_sub_ps(vrep,vdisp));
}
1021
/* Extract 32-bit integer lane imm (0-3) from an SSE2 register: SSE2 has no
 * _mm_extract_epi32 (that is SSE4.1), so shift the wanted lane down to
 * element 0 and read it with _mm_cvtsi128_si32. imm must be a constant.
 */
#define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
1023
1024
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_4_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    /* Cubic-spline table lookup of the coulomb energy and force for four
     * pairs at once. VFtab holds 4 floats (Y,F,G,H) per table point; entries
     * are fetched with the aligned _mm_load_ps, so VFtab must be 16-byte
     * aligned. The energy qq*(Y + eps*Fp) is accumulated into *vctot; the
     * returned value is qq*(F + 2*Geps + 3*Heps2)*tabscale, i.e. the
     * spline derivative dV/dr (multiply by -rinv for fscal).
     */
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    rt       = _mm_mul_ps(r,tabscale);
    n0       = _mm_cvttps_epi32(rt);                  /* table point below r */
    eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));   /* fractional offset within the interval */
    eps2     = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a      = gmx_mm_extract_epi32(n0,0);
    n_b      = gmx_mm_extract_epi32(n0,1);
    n_c      = gmx_mm_extract_epi32(n0,2);
    n_d      = gmx_mm_extract_epi32(n0,3);
    /* Gather the four (Y,F,G,H) rows and transpose to one vector per field */
    Y        = _mm_load_ps(VFtab + 4* n_a);
    F        = _mm_load_ps(VFtab + 4* n_b);
    G        = _mm_load_ps(VFtab + 4* n_c);
    H        = _mm_load_ps(VFtab + 4* n_d);
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H        = _mm_mul_ps(H,eps2);              /* Heps2 */
    G        = _mm_mul_ps(G,eps);               /* Geps  */
    F        = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
    vcoul    = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    F        = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
1058
1059
1060
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_4_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    /* Cubic-spline table lookup of the LJ dispersion + repulsion energy and
     * force for four pairs at once. Each table point holds 4*(offset+2)
     * floats; the dispersion (Y,F,G,H) quadruplet starts 4*offset floats
     * into the point and the repulsion quadruplet 4 floats after that.
     * Entries are fetched with the aligned _mm_load_ps, so VFtab must be
     * 16-byte aligned. Energy is accumulated into *vvdwtot; the return
     * value is dV/dr (multiply by -rinv to get fscal).
     */
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    rt       = _mm_mul_ps(r,tabscale);
    n0       = _mm_cvttps_epi32(rt);                  /* table point below r */
    eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));   /* fractional offset within the interval */
    eps2     = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a      = gmx_mm_extract_epi32(n0,0);
    n_b      = gmx_mm_extract_epi32(n0,1);
    n_c      = gmx_mm_extract_epi32(n0,2);
    n_d      = gmx_mm_extract_epi32(n0,3);

    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     */
    Yd       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset);
    Fd       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset);
    Gd       = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset);
    Hd       = _mm_load_ps(VFtab + 4*(offset+2)* n_d + 4*offset);
    Yr       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset + 4);
    Fr       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset + 4);
    Gr       = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset + 4);
    Hr       = _mm_load_ps(VFtab + 4*(offset+2)* n_d + 4*offset + 4);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
    Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
    Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
    Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
    Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
    Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd        = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr        = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
}
1109
1110
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_4_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    /* Combined cubic-spline table lookup of coulomb + LJ energy and force
     * for four pairs at once. Each table point holds 12 floats: the coulomb
     * (Y,F,G,H) quadruplet at offset 0, dispersion at offset 4 and repulsion
     * at offset 8. Entries are fetched with the aligned _mm_load_ps, so
     * VFtab must be 16-byte aligned. Energies are accumulated into *vctot
     * and *vvdwtot; the return value is the total dV/dr (multiply by -rinv
     * for fscal).
     */
    __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c,n_d;

    rt       = _mm_mul_ps(r,tabscale);
    n0       = _mm_cvttps_epi32(rt);                  /* table point below r */
    eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));   /* fractional offset within the interval */
    eps2     = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a      = gmx_mm_extract_epi32(n0,0);
    n_b      = gmx_mm_extract_epi32(n0,1);
    n_c      = gmx_mm_extract_epi32(n0,2);
    n_d      = gmx_mm_extract_epi32(n0,3);

    /* Gather coulomb, dispersion and repulsion rows, then transpose each
     * set so every field ends up in one vector with one lane per pair.
     */
    Yc       = _mm_load_ps(VFtab + 12* n_a);
    Fc       = _mm_load_ps(VFtab + 12* n_b);
    Gc       = _mm_load_ps(VFtab + 12* n_c);
    Hc       = _mm_load_ps(VFtab + 12* n_d);
    Yd       = _mm_load_ps(VFtab + 12* n_a + 4);
    Fd       = _mm_load_ps(VFtab + 12* n_b + 4);
    Gd       = _mm_load_ps(VFtab + 12* n_c + 4);
    Hd       = _mm_load_ps(VFtab + 12* n_d + 4);
    Yr       = _mm_load_ps(VFtab + 12* n_a + 8);
    Fr       = _mm_load_ps(VFtab + 12* n_b + 8);
    Gr       = _mm_load_ps(VFtab + 12* n_c + 8);
    Hr       = _mm_load_ps(VFtab + 12* n_d + 8);
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hc       = _mm_mul_ps(Hc,eps2);              /* Heps2 */
    Gc       = _mm_mul_ps(Gc,eps);               /* Geps  */
    Fc       = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc));  /* Fp    */
    Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
    Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
    Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
    Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
    Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
    Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */

    vcoul    = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fc       = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
    Fd       = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr       = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
}
1170
1171
1172
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_3_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    /* As gmx_mm_int_4_table_coulomb_ps, but only lanes 0-2 of r are used as
     * table indices. The fourth transpose row is zeroed, so lane 3 of both
     * the accumulated energy and the returned force is zero. VFtab holds
     * 4 floats (Y,F,G,H) per point and must be 16-byte aligned
     * (_mm_load_ps).
     */
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b,n_c;

    rt       = _mm_mul_ps(r,tabscale);
    n0       = _mm_cvttps_epi32(rt);                  /* table point below r */
    eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));   /* fractional offset within the interval */
    eps2     = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a      = gmx_mm_extract_epi32(n0,0);
    n_b      = gmx_mm_extract_epi32(n0,1);
    n_c      = gmx_mm_extract_epi32(n0,2);
    Y        = _mm_load_ps(VFtab + 4* n_a);
    F        = _mm_load_ps(VFtab + 4* n_b);
    G        = _mm_load_ps(VFtab + 4* n_c);
    H        = _mm_setzero_ps();                /* padding for the unused fourth lane */
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H        = _mm_mul_ps(H,eps2);              /* Heps2 */
    G        = _mm_mul_ps(G,eps);               /* Geps  */
    F        = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
    vcoul    = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    F        = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
1205
1206
1207
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_3_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    /* Cubic-spline table lookup of LJ dispersion + repulsion energy and
     * force for three pairs (the fourth SSE lane is padded with zeros and
     * contributes nothing). Each table point holds 4*(offset+2) floats;
     * the dispersion (Y,F,G,H) quadruplet starts 4*offset floats into the
     * point and the repulsion quadruplet 4 floats after that. Entries are
     * fetched with the aligned _mm_load_ps, so VFtab must be 16-byte
     * aligned. Energy goes to *vvdwtot; the return value is dV/dr
     * (multiply by -rinv to get fscal).
     */
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c;

    rt       = _mm_mul_ps(r,tabscale);
    n0       = _mm_cvttps_epi32(rt);
    eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2     = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a      = gmx_mm_extract_epi32(n0,0);
    n_b      = gmx_mm_extract_epi32(n0,1);
    n_c      = gmx_mm_extract_epi32(n0,2);

    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     * The LJ data starts 4*offset floats into each point (skipping the four
     * coulomb floats of a mixed table), exactly as in
     * gmx_mm_int_4_table_lj_ps; the previous "+ offset" addressing read the
     * wrong data and broke the 16-byte alignment _mm_load_ps requires.
     */
    Yd       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset);
    Fd       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset);
    Gd       = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset);
    Hd       = _mm_setzero_ps();
    Yr       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset + 4);
    Fr       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset + 4);
    Gr       = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset + 4);
    Hr       = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
    Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
    Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
    Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
    Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
    Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd        = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr        = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
}
1255
1256
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_3_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    /* Combined cubic-spline coulomb + LJ lookup for three pairs; only lanes
     * 0-2 of r are used as table indices, and the fourth transpose row is
     * zeroed so lane 3 contributes nothing. Each table point holds 12
     * floats: coulomb (Y,F,G,H) at offset 0, dispersion at 4, repulsion at
     * 8; VFtab must be 16-byte aligned (_mm_load_ps). Energies go to
     * *vctot / *vvdwtot; the return value is the total dV/dr (multiply by
     * -rinv for fscal).
     */
    __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b,n_c;

    rt       = _mm_mul_ps(r,tabscale);
    n0       = _mm_cvttps_epi32(rt);                  /* table point below r */
    eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));   /* fractional offset within the interval */
    eps2     = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a      = gmx_mm_extract_epi32(n0,0);
    n_b      = gmx_mm_extract_epi32(n0,1);
    n_c      = gmx_mm_extract_epi32(n0,2);


    Yc       = _mm_load_ps(VFtab + 12* n_a);
    Fc       = _mm_load_ps(VFtab + 12* n_b);
    Gc       = _mm_load_ps(VFtab + 12* n_c);
    Hc       = _mm_setzero_ps();                /* padding for the unused fourth lane */
    Yd       = _mm_load_ps(VFtab + 12* n_a + 4);
    Fd       = _mm_load_ps(VFtab + 12* n_b + 4);
    Gd       = _mm_load_ps(VFtab + 12* n_c + 4);
    Hd       = _mm_setzero_ps();
    Yr       = _mm_load_ps(VFtab + 12* n_a + 8);
    Fr       = _mm_load_ps(VFtab + 12* n_b + 8);
    Gr       = _mm_load_ps(VFtab + 12* n_c + 8);
    Hr       = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hc       = _mm_mul_ps(Hc,eps2);              /* Heps2 */
    Gc       = _mm_mul_ps(Gc,eps);               /* Geps  */
    Fc       = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc));  /* Fp    */
    Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
    Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
    Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
    Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
    Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
    Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */

    vcoul    = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fc       = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
    Fd       = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr       = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
}
1315
1316
1317
1318
1319
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_2_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    /* As gmx_mm_int_4_table_coulomb_ps, but only lanes 0-1 of r are used as
     * table indices; the two remaining transpose rows are zeroed, so lanes
     * 2-3 of both the accumulated energy and the returned force are zero.
     * VFtab holds 4 floats (Y,F,G,H) per point and must be 16-byte aligned
     * (_mm_load_ps).
     */
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a,n_b;

    rt       = _mm_mul_ps(r,tabscale);
    n0       = _mm_cvttps_epi32(rt);                  /* table point below r */
    eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));   /* fractional offset within the interval */
    eps2     = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a      = gmx_mm_extract_epi32(n0,0);
    n_b      = gmx_mm_extract_epi32(n0,1);
    Y        = _mm_load_ps(VFtab + 4* n_a);
    F        = _mm_load_ps(VFtab + 4* n_b);
    G        = _mm_setzero_ps();                /* padding for the unused lanes */
    H        = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H        = _mm_mul_ps(H,eps2);              /* Heps2 */
    G        = _mm_mul_ps(G,eps);               /* Geps  */
    F        = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
    vcoul    = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    F        = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
1351
1352
1353
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_2_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    /* Cubic-spline table lookup of LJ dispersion + repulsion energy and
     * force for two pairs (the upper two SSE lanes are padded with zeros
     * and contribute nothing). Each table point holds 4*(offset+2) floats;
     * the dispersion (Y,F,G,H) quadruplet starts 4*offset floats into the
     * point and the repulsion quadruplet 4 floats after that. Entries are
     * fetched with the aligned _mm_load_ps, so VFtab must be 16-byte
     * aligned. Energy goes to *vvdwtot; the return value is dV/dr
     * (multiply by -rinv to get fscal).
     */
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b;

    rt       = _mm_mul_ps(r,tabscale);
    n0       = _mm_cvttps_epi32(rt);
    eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2     = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a      = gmx_mm_extract_epi32(n0,0);
    n_b      = gmx_mm_extract_epi32(n0,1);

    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     * The LJ data starts 4*offset floats into each point (skipping the four
     * coulomb floats of a mixed table), exactly as in
     * gmx_mm_int_4_table_lj_ps; the previous "+ offset" addressing read the
     * wrong data and broke the 16-byte alignment _mm_load_ps requires.
     */
    Yd       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset);
    Fd       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset);
    Gd       = _mm_setzero_ps();
    Hd       = _mm_setzero_ps();
    Yr       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset + 4);
    Fr       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset + 4);
    Gr       = _mm_setzero_ps();
    Hr       = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
    Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
    Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
    Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
    Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
    Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd        = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr        = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
}
1400
1401
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_2_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    /* Combined cubic-spline coulomb + LJ lookup for two pairs; only lanes
     * 0-1 of r are used as table indices, and the remaining transpose rows
     * are zeroed so lanes 2-3 contribute nothing. Each table point holds
     * 12 floats: coulomb (Y,F,G,H) at offset 0, dispersion at 4, repulsion
     * at 8; VFtab must be 16-byte aligned (_mm_load_ps). Energies go to
     * *vctot / *vvdwtot; the return value is the total dV/dr (multiply by
     * -rinv for fscal).
     */
    __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a,n_b;

    rt       = _mm_mul_ps(r,tabscale);
    n0       = _mm_cvttps_epi32(rt);                  /* table point below r */
    eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));   /* fractional offset within the interval */
    eps2     = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a      = gmx_mm_extract_epi32(n0,0);
    n_b      = gmx_mm_extract_epi32(n0,1);

    Yc       = _mm_load_ps(VFtab + 12* n_a);
    Fc       = _mm_load_ps(VFtab + 12* n_b);
    Gc       = _mm_setzero_ps();                /* padding for the unused lanes */
    Hc       = _mm_setzero_ps();
    Yd       = _mm_load_ps(VFtab + 12* n_a + 4);
    Fd       = _mm_load_ps(VFtab + 12* n_b + 4);
    Gd       = _mm_setzero_ps();
    Hd       = _mm_setzero_ps();
    Yr       = _mm_load_ps(VFtab + 12* n_a + 8);
    Fr       = _mm_load_ps(VFtab + 12* n_b + 8);
    Gr       = _mm_setzero_ps();
    Hr       = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hc       = _mm_mul_ps(Hc,eps2);              /* Heps2 */
    Gc       = _mm_mul_ps(Gc,eps);               /* Geps  */
    Fc       = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc));  /* Fp    */
    Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
    Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
    Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
    Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
    Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
    Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */

    vcoul    = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fc       = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
    Fd       = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr       = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
}
1458
1459
1460
1461
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_1_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    /* As gmx_mm_int_4_table_coulomb_ps, but only lane 0 of r is used as a
     * table index; the other three transpose rows are zeroed, so lanes 1-3
     * of both the accumulated energy and the returned force are zero.
     * VFtab holds 4 floats (Y,F,G,H) per point and must be 16-byte aligned
     * (_mm_load_ps).
     */
    __m128  rt,eps,eps2,Y,F,G,H,vcoul;
    __m128i n0;
    int     n_a;

    rt       = _mm_mul_ps(r,tabscale);
    n0       = _mm_cvttps_epi32(rt);                  /* table point below r */
    eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));   /* fractional offset within the interval */
    eps2     = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a      = gmx_mm_extract_epi32(n0,0);
    Y        = _mm_load_ps(VFtab + 4* n_a);
    F        = _mm_setzero_ps();                /* padding for the unused lanes */
    G        = _mm_setzero_ps();
    H        = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Y,F,G,H);
    H        = _mm_mul_ps(H,eps2);              /* Heps2 */
    G        = _mm_mul_ps(G,eps);               /* Geps  */
    F        = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
    vcoul    = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    F        = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));

    return _mm_mul_ps(F,tabscale);
}
1492
1493
1494
/* Return force should be multiplied by -rinv to get fscal */
static inline __m128
gmx_mm_int_1_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    /* Cubic-spline table lookup of LJ dispersion + repulsion energy and
     * force for a single pair (only lane 0 is used; the other lanes are
     * padded with zeros and contribute nothing). Each table point holds
     * 4*(offset+2) floats; the dispersion (Y,F,G,H) quadruplet starts
     * 4*offset floats into the point and the repulsion quadruplet 4 floats
     * after that. Entries are fetched with the aligned _mm_load_ps, so
     * VFtab must be 16-byte aligned. Energy goes to *vvdwtot; the return
     * value is dV/dr (multiply by -rinv to get fscal).
     */
    __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
    __m128i n0;
    int     n_a;

    rt       = _mm_mul_ps(r,tabscale);
    n0       = _mm_cvttps_epi32(rt);
    eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
    eps2     = _mm_mul_ps(eps,eps);

    /* Extract indices from n0 */
    n_a      = gmx_mm_extract_epi32(n0,0);

    /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
     * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
     * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
     * The LJ data starts 4*offset floats into each point (skipping the four
     * coulomb floats of a mixed table), exactly as in
     * gmx_mm_int_4_table_lj_ps; the previous "+ offset" addressing read the
     * wrong data and broke the 16-byte alignment _mm_load_ps requires.
     */
    Yd       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset);
    Fd       = _mm_setzero_ps();
    Gd       = _mm_setzero_ps();
    Hd       = _mm_setzero_ps();
    Yr       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset + 4);
    Fr       = _mm_setzero_ps();
    Gr       = _mm_setzero_ps();
    Hr       = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
    _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
    Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
    Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
    Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
    Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
    Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
    Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    Fd        = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
    Fr        = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));

    return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
}
1540
1541
1542 /* Return force should be multiplied by -rinv to get fscal */
/* Lane-0-only lookup in a combined Coulomb + Lennard-Jones cubic-spline
 * table (stride 12 floats per table point: 4 Coulomb, 4 dispersion,
 * 4 repulsion).  Accumulates the energies into *vctot / *vvdwtot and
 * returns the table-space force; the caller multiplies by -rinv to get
 * fscal (see comment above).
 */
static inline __m128
gmx_mm_int_1_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12, 
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128  rtab,frac,frac2,vcoul,vvdw6,vvdw12;
    __m128  Yq,Fq,Gq,Hq,Yd6,Fd6,Gd6,Hd6,Yr12,Fr12,Gr12,Hr12;
    __m128i tabidx;
    int     idx;

    /* Table coordinate: integer row index plus fractional offset */
    rtab   = _mm_mul_ps(r,tabscale);
    tabidx = _mm_cvttps_epi32(rtab);
    frac   = _mm_sub_ps(rtab, _mm_cvtepi32_ps(tabidx));
    frac2  = _mm_mul_ps(frac,frac);

    /* Only one interaction: extract the first lane's index */
    idx    = gmx_mm_extract_epi32(tabidx,0);

    /* Load the Y,F,G,H rows for coulomb, dispersion and repulsion;
     * the transposes spread Y/F/G/H across separate registers */
    Yq     = _mm_load_ps(VFtab + 12*idx);
    Fq     = _mm_setzero_ps();
    Gq     = _mm_setzero_ps();
    Hq     = _mm_setzero_ps();
    Yd6    = _mm_load_ps(VFtab + 12*idx + 4);
    Fd6    = _mm_setzero_ps();
    Gd6    = _mm_setzero_ps();
    Hd6    = _mm_setzero_ps();
    Yr12   = _mm_load_ps(VFtab + 12*idx + 8);
    Fr12   = _mm_setzero_ps();
    Gr12   = _mm_setzero_ps();
    Hr12   = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(Yq,Fq,Gq,Hq);
    _MM_TRANSPOSE4_PS(Yd6,Fd6,Gd6,Hd6);
    _MM_TRANSPOSE4_PS(Yr12,Fr12,Gr12,Hr12);

    /* Cubic spline evaluation: Fp = F + G*eps + H*eps^2 for each term */
    Hq     = _mm_mul_ps(Hq,frac2);                   /* Heps2 */
    Gq     = _mm_mul_ps(Gq,frac);                    /* Geps  */
    Fq     = _mm_add_ps(Fq, _mm_add_ps(Gq,Hq));      /* Fp    */
    Hd6    = _mm_mul_ps(Hd6,frac2);                  /* Heps2 */
    Gd6    = _mm_mul_ps(Gd6,frac);                   /* Geps  */
    Fd6    = _mm_add_ps(Fd6, _mm_add_ps(Gd6,Hd6));   /* Fp    */
    Hr12   = _mm_mul_ps(Hr12,frac2);                 /* Heps2 */
    Gr12   = _mm_mul_ps(Gr12,frac);                  /* Geps  */
    Fr12   = _mm_add_ps(Fr12, _mm_add_ps(Gr12,Hr12));/* Fp    */

    /* Energies: V = Y + eps*Fp, weighted by qq / c6 / c12 */
    vcoul    = _mm_mul_ps(qq, _mm_add_ps(Yq, _mm_mul_ps(frac,Fq)));
    *vctot   = _mm_add_ps(*vctot,vcoul);

    vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd6,  _mm_mul_ps(frac,Fd6)));
    vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr12, _mm_mul_ps(frac,Fr12)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));

    /* Table force: FF = Fp + Geps + 2*Heps2, summed over the three terms */
    Fq     = _mm_mul_ps(qq,  _mm_add_ps(Fq,  _mm_add_ps(Gq,  _mm_add_ps(Hq,Hq))));
    Fd6    = _mm_mul_ps(c6,  _mm_add_ps(Fd6, _mm_add_ps(Gd6, _mm_add_ps(Hd6,Hd6))));
    Fr12   = _mm_mul_ps(c12, _mm_add_ps(Fr12,_mm_add_ps(Gr12,_mm_add_ps(Hr12,Hr12))));

    return _mm_mul_ps( _mm_add_ps(Fq,_mm_add_ps(Fd6,Fr12)),tabscale);
}
1597
1598
1599
1600
1601
1602 /* Return force should be multiplied by +rinv to get fscal */
/* Tabulated Generalized Born (polarization) interaction of one i-atom
 * with four j-atoms.  Looks up the GB table (4 floats Y,F,G,H per point,
 * one index per lane), evaluates the cubic spline, then:
 *   - subtracts qq*VV from *vgbtot (sign is intentional, see below),
 *   - accumulates 0.5*(vgb + ftmp*r) into *dvdasum,
 *   - scatters the isaj^2-weighted per-lane derivative into dvdaj1..4,
 *   - returns the table force qq*FF*gbtabscale; the caller multiplies
 *     by +rinv to obtain fscal (see comment above).
 */
static inline __m128
gmx_mm_int_4_genborn_ps(__m128 r, __m128 isai, 
						float * isaj1, float *isaj2, float *isaj3, float *isaj4,
						__m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum, 
						float *dvdaj1, float *dvdaj2, float *dvdaj3, float *dvdaj4, 
						__m128 *vgbtot)
{
	const __m128 half  = {0.5,0.5,0.5,0.5};
	
    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
	__m128i n0;
	int     n_a,n_b,n_c,n_d;
	
	/* Assemble isaj: gather the four scalar screening factors into one vector */
	isaj     = _mm_load_ss(isaj1);
	t2       = _mm_load_ss(isaj2);
	t3       = _mm_load_ss(isaj3);
	t4       = _mm_load_ss(isaj4);
	isaj     = _mm_unpacklo_ps(isaj,t2);  /* - - t2 t1 */
	t3       = _mm_unpacklo_ps(t3,t4);  /* - - t4 t3 */
	isaj     = _mm_movelh_ps(isaj,t3); /* t4 t3 t2 t1 */
	
	/* Both the charge product and the table spacing are scaled by isai*isaj */
	isaprod     = _mm_mul_ps(isai,isaj);
	qq          = _mm_mul_ps(qq,isaprod);
	gbtabscale  = _mm_mul_ps( isaprod, gbtabscale );
	
	/* Table coordinate: integer index n0 plus fractional part eps */
	rt       = _mm_mul_ps(r,gbtabscale); 
	n0       = _mm_cvttps_epi32(rt);
	eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
	eps2     = _mm_mul_ps(eps,eps);
		
	/* Extract indices from n0 */
	n_a      = gmx_mm_extract_epi32(n0,0);
	n_b      = gmx_mm_extract_epi32(n0,1);
	n_c      = gmx_mm_extract_epi32(n0,2);
	n_d      = gmx_mm_extract_epi32(n0,3);
	Y        = _mm_load_ps(GBtab + 4* n_a);
	F        = _mm_load_ps(GBtab + 4* n_b);
	G        = _mm_load_ps(GBtab + 4* n_c);
	H        = _mm_load_ps(GBtab + 4* n_d);
	_MM_TRANSPOSE4_PS(Y,F,G,H);
	/* Cubic spline: VV = Y + Fp*eps, FF = Fp + Geps + 2*Heps2 */
	G        = _mm_mul_ps(G,eps);               /* Geps  */
	H        = _mm_mul_ps(H,eps2);              /* Heps2 */
	F        = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */
		
	VV       = _mm_add_ps(Y, _mm_mul_ps(eps,F));
	FF       = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));

	vgb      = _mm_mul_ps(qq, VV);
	*vgbtot  = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */

	ftmp     = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
	
	dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
	
	*dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
	
	/* Chain rule: the j-atom contribution carries a factor isaj^2 */
	dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);

	/* Update 4 dada[j] values: t2/t3/t4 pull lanes 1/2/3 of dvdatmp
	 * down to lane 0 so each can be stored with a scalar add */
	Y        = _mm_load_ss(dvdaj1);
	F        = _mm_load_ss(dvdaj2);
	G        = _mm_load_ss(dvdaj3);
	H        = _mm_load_ss(dvdaj4);
	t3       = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
	t2       = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
	t4       = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,1));
		
	_mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
	_mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
	_mm_store_ss( dvdaj3 , _mm_add_ss( G, t3 ) );
	_mm_store_ss( dvdaj4 , _mm_add_ss( H, t4 ) );
	
	return ftmp;
}
1678
1679
1680
1681 /* Return force should be multiplied by +rinv to get fscal */
/* Tabulated Generalized Born interaction of one i-atom with three
 * j-atoms (lane 3 duplicates lane 2's isaj; table row 3 is zeroed).
 * Accumulates -qq*VV into *vgbtot, 0.5*(vgb+ftmp*r) into *dvdasum,
 * scatters the isaj^2-weighted derivative into dvdaj1..3, and returns
 * the table force; the caller multiplies by +rinv to get fscal.
 * Cleanup: removed the unused locals t4 and n_d inherited from the
 * 4-atom variant.
 */
static inline __m128
gmx_mm_int_3_genborn_ps(__m128 r, __m128 isai, 
						float * isaj1, float *isaj2, float *isaj3, 
						__m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum, 
						float *dvdaj1, float *dvdaj2, float *dvdaj3,
						__m128 *vgbtot)
{
	const __m128 half  = {0.5,0.5,0.5,0.5};
	
    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,isaj,vgb,dvdatmp;
	__m128i n0;
	int     n_a,n_b,n_c;
	
	/* Assemble isaj from the three scalar screening factors */
	isaj     = _mm_load_ss(isaj1);
	t2       = _mm_load_ss(isaj2);
	t3       = _mm_load_ss(isaj3);
	isaj     = _mm_unpacklo_ps(isaj,t2);  /* - - t2 t1 */
	t3       = _mm_unpacklo_ps(t3,t3);  /* - - t3 t3 */
	isaj     = _mm_movelh_ps(isaj,t3); /* t3 t3 t2 t1 */
	
	/* Scale charge product and table spacing by isai*isaj */
	isaprod     = _mm_mul_ps(isai,isaj);
	qq          = _mm_mul_ps(qq,isaprod);
	gbtabscale  = _mm_mul_ps( isaprod, gbtabscale );
	
	/* Table coordinate: integer index n0 plus fractional part eps */
	rt       = _mm_mul_ps(r,gbtabscale); 
	n0       = _mm_cvttps_epi32(rt);
	eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
	eps2     = _mm_mul_ps(eps,eps);
	
	/* Extract indices from n0 */
	n_a      = gmx_mm_extract_epi32(n0,0);
	n_b      = gmx_mm_extract_epi32(n0,1);
	n_c      = gmx_mm_extract_epi32(n0,2);
	Y        = _mm_load_ps(GBtab + 4* n_a);
	F        = _mm_load_ps(GBtab + 4* n_b);
	G        = _mm_load_ps(GBtab + 4* n_c);
	H        = _mm_setzero_ps();        /* no 4th interaction */
	_MM_TRANSPOSE4_PS(Y,F,G,H);
	/* Cubic spline: VV = Y + Fp*eps, FF = Fp + Geps + 2*Heps2 */
	G        = _mm_mul_ps(G,eps);               /* Geps  */
	H        = _mm_mul_ps(H,eps2);              /* Heps2 */
	F        = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */
	
	VV       = _mm_add_ps(Y, _mm_mul_ps(eps,F));
	FF       = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
	
	vgb      = _mm_mul_ps(qq, VV);
	*vgbtot  = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
	
	ftmp     = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
	
	dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
	
	*dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
	
	/* Chain rule: j-atom contribution carries a factor isaj^2 */
	dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
	
	/* Update 3 dada[j] values: t2/t3 bring lanes 1/2 down to lane 0 */
	Y        = _mm_load_ss(dvdaj1);
	F        = _mm_load_ss(dvdaj2);
	G        = _mm_load_ss(dvdaj3);
	t3       = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
	t2       = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
	
	_mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
	_mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
	_mm_store_ss( dvdaj3 , _mm_add_ss( G, t3 ) );
	
	return ftmp;
}
1752
1753
1754
1755
1756 /* Return force should be multiplied by +rinv to get fscal */
/* Tabulated Generalized Born interaction of one i-atom with two j-atoms
 * (upper lanes of isaj are unused; table rows 2/3 are zeroed).
 * Accumulates -qq*VV into *vgbtot, 0.5*(vgb+ftmp*r) into *dvdasum,
 * scatters the isaj^2-weighted derivative into dvdaj1..2, and returns
 * the table force; the caller multiplies by +rinv to get fscal.
 * Cleanup: removed the unused locals t3, t4, n_c and n_d inherited from
 * the 4-atom variant.
 */
static inline __m128
gmx_mm_int_2_genborn_ps(__m128 r, __m128 isai, 
						float * isaj1, float *isaj2, 
						__m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum, 
						float *dvdaj1, float *dvdaj2,
						__m128 *vgbtot)
{
	const __m128 half  = {0.5,0.5,0.5,0.5};
	
    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,isaj,vgb,dvdatmp;
	__m128i n0;
	int     n_a,n_b;
	
	/* Assemble isaj from the two scalar screening factors */
	isaj     = _mm_load_ss(isaj1);
	t2       = _mm_load_ss(isaj2);
	isaj     = _mm_unpacklo_ps(isaj,t2);  /* - - t2 t1 */
	
	/* Scale charge product and table spacing by isai*isaj */
	isaprod     = _mm_mul_ps(isai,isaj);
	qq          = _mm_mul_ps(qq,isaprod);
	gbtabscale  = _mm_mul_ps( isaprod, gbtabscale );
	
	/* Table coordinate: integer index n0 plus fractional part eps */
	rt       = _mm_mul_ps(r,gbtabscale); 
	n0       = _mm_cvttps_epi32(rt);
	eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
	eps2     = _mm_mul_ps(eps,eps);
	
	/* Extract indices from n0 */
	n_a      = gmx_mm_extract_epi32(n0,0);
	n_b      = gmx_mm_extract_epi32(n0,1);
	Y        = _mm_load_ps(GBtab + 4* n_a);
	F        = _mm_load_ps(GBtab + 4* n_b);
	G        = _mm_setzero_ps();        /* no 3rd interaction */
	H        = _mm_setzero_ps();        /* no 4th interaction */
	_MM_TRANSPOSE4_PS(Y,F,G,H);
	/* Cubic spline: VV = Y + Fp*eps, FF = Fp + Geps + 2*Heps2 */
	G        = _mm_mul_ps(G,eps);               /* Geps  */
	H        = _mm_mul_ps(H,eps2);              /* Heps2 */
	F        = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */
	
	VV       = _mm_add_ps(Y, _mm_mul_ps(eps,F));
	FF       = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
	
	vgb      = _mm_mul_ps(qq, VV);
	*vgbtot  = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
	
	ftmp     = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
	
	dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
	
	*dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
	
	/* Chain rule: j-atom contribution carries a factor isaj^2 */
	dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
	
	/* Update 2 dada[j] values: t2 brings lane 1 down to lane 0 */
	Y        = _mm_load_ss(dvdaj1);
	F        = _mm_load_ss(dvdaj2);
	t2       = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
	
	_mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
	_mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
	
	return ftmp;
}
1820
1821 /* Return force should be multiplied by +rinv to get fscal */
/* Tabulated Generalized Born interaction of one i-atom with a single
 * j-atom (lane 0 only; table rows 1..3 are zeroed).  Accumulates
 * -qq*VV into *vgbtot, 0.5*(vgb+ftmp*r) into *dvdasum, adds the
 * isaj^2-weighted derivative to *dvdaj1, and returns the table force;
 * the caller multiplies by +rinv to get fscal.
 * Cleanup: removed the unused locals t2, t3, t4, n_b, n_c and n_d
 * inherited from the 4-atom variant.
 */
static inline __m128
gmx_mm_int_1_genborn_ps(__m128 r, __m128 isai, 
						float * isaj1, 
						__m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum, 
						float *dvdaj1, 
						__m128 *vgbtot)
{
	const __m128 half  = {0.5,0.5,0.5,0.5};
	
    __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,isaj,vgb,dvdatmp;
	__m128i n0;
	int     n_a;
	
	/* Only one j-atom: its screening factor sits in lane 0 */
	isaj     = _mm_load_ss(isaj1);
	
	/* Scale charge product and table spacing by isai*isaj */
	isaprod     = _mm_mul_ps(isai,isaj);
	qq          = _mm_mul_ps(qq,isaprod);
	gbtabscale  = _mm_mul_ps( isaprod, gbtabscale );
	
	/* Table coordinate: integer index n0 plus fractional part eps */
	rt       = _mm_mul_ps(r,gbtabscale); 
	n0       = _mm_cvttps_epi32(rt);
	eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
	eps2     = _mm_mul_ps(eps,eps);
	
	/* Extract index from n0 */
	n_a      = gmx_mm_extract_epi32(n0,0);
	Y        = _mm_load_ps(GBtab + 4* n_a);
	F        = _mm_setzero_ps();
	G        = _mm_setzero_ps();
	H        = _mm_setzero_ps();
	_MM_TRANSPOSE4_PS(Y,F,G,H);
	/* Cubic spline: VV = Y + Fp*eps, FF = Fp + Geps + 2*Heps2 */
	G        = _mm_mul_ps(G,eps);               /* Geps  */
	H        = _mm_mul_ps(H,eps2);              /* Heps2 */
	F        = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */
	
	VV       = _mm_add_ps(Y, _mm_mul_ps(eps,F));
	FF       = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
	
	vgb      = _mm_mul_ps(qq, VV);
	*vgbtot  = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
	
	ftmp     = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
	
	dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
	
	*dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
	
	/* Chain rule: j-atom contribution carries a factor isaj^2 */
	dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
	
	/* Update the single dada[j] value */
	Y        = _mm_load_ss(dvdaj1);
	
	_mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
	
	return ftmp;
}
1879
1880
1881
1882
1883
/* Reduce the four per-interaction force contributions for one i-atom and
 * add the x/y/z sums both to the force array fptr[0..2] and to the shift
 * force fshiftptr[0..2]. */
static inline void
gmx_mm_update_iforce_1atom_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                              float *fptr,
                              float *fshiftptr)
{
    __m128 xdup,fsum,facc,sacc;

    /* Duplicating fix1 keeps the 4x4 transpose well-formed; after the
     * transpose + add, fsum = ( fz fy fx fx ) with each component summed
     * over the four j interactions. */
    xdup = fix1;
    _MM_TRANSPOSE4_PS(fix1,xdup,fiy1,fiz1);
    fsum = _mm_add_ps(_mm_add_ps(fix1,xdup), _mm_add_ps(fiy1,fiz1));

    /* Load existing values in the same ( fz fy - fx ) layout */
    facc = _mm_loadh_pi(_mm_load_ss(fptr),      (__m64 *)(fptr+1));
    sacc = _mm_loadh_pi(_mm_load_ss(fshiftptr), (__m64 *)(fshiftptr+1));

    facc = _mm_add_ps(facc,fsum);
    sacc = _mm_add_ps(sacc,fsum);

    /* Write back: lane 0 holds x, lanes 2-3 hold y,z */
    _mm_store_ss(fptr,facc);
    _mm_storeh_pi((__m64 *)(fptr+1),facc);
    _mm_store_ss(fshiftptr,sacc);
    _mm_storeh_pi((__m64 *)(fshiftptr+1),sacc);
}
1909
/* Reduce the per-interaction forces for two i-atoms: add the six
 * component sums to fptr[0..5] and the total shift force to
 * fshiftptr[0..2].
 * Bug fix: the shift-force accumulation previously read the
 * uninitialized register t3 ("t3 = _mm_add_ps(t3,t4)"), adding garbage
 * to the shift force; the old contents (t4) are now added directly. */
static inline void
gmx_mm_update_iforce_2atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 t1,t2,t4;

    /* transpose data: fix1 becomes ( Sfix2 Sfiz1 Sfiy1 Sfix1 ) after the
     * adds; fiy2 lanes 0,1 become ( Sfiz2 Sfiy2 ) */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    t1 = _mm_unpacklo_ps(fiy2,fiz2);
    t2 = _mm_unpackhi_ps(fiy2,fiz2);

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    t1   = _mm_add_ps(t1,t2);
    t2   = _mm_movehl_ps(t2,t1);
    fiy2 = _mm_add_ps(t1,t2);

    /* Accumulate into the force array */
    _mm_storeu_ps(fptr,   _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    t1 = _mm_loadl_pi(t1,(__m64 *)(fptr+4));
    _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));

    /* Load old shift force as ( y x - z ) */
    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2));   /* fiy2  -   fix2 fiz1 */
    t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));       /* fiy2 fix2  -   fiz1 */
    t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1));   /* fiy1 fix1  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t1 = _mm_add_ps(t1,t4); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
1946
1947
1948
/* Reduce the per-interaction forces for three i-atoms: add the nine
 * component sums to fptr[0..8] and the total shift force to
 * fshiftptr[0..2].
 * Fix: the first fiz3 reduction step previously read the uninitialized
 * register t2 as the pass-through operand of _mm_movehl_ps; using fiz3
 * for both operands yields the same lanes 0-1 without touching an
 * indeterminate value. */
static inline void
gmx_mm_update_iforce_3atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 t1,t2,t3,t4;

    /* transpose data; fiz3 is reduced separately in lane 0 */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
    t2   = _mm_movehl_ps(fiz3,fiz3);                       /* lanes 0,1 = fiz3[2],fiz3[3] */
    t1   = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));
    t3   = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,1));

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
    fiz3 = _mm_add_ss(_mm_add_ps(fiz3,t1)  , _mm_add_ps(t2,t3));

    /* Accumulate into the force array */
    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));

    /* Load old shift force as ( y x - z ) */
    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
    t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
    t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
    t3 = _mm_shuffle_ps(t3  ,t3  ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
1988
/* Reduce the per-interaction forces for four i-atoms: add the twelve
 * component sums to fptr[0..11] and the total shift force to
 * fshiftptr[0..2].
 * Bug fix: the second shift-force load was "t5 = _mm_loadh_pi(t4,...)",
 * which read the uninitialized register t4 and discarded the z shift
 * component just loaded into t5; it now continues loading into t5. */
static inline void
gmx_mm_update_iforce_4atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               __m128 fix4, __m128 fiy4, __m128 fiz4,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 t1,t2,t3,t4,t5;

    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
    _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
    fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));

    /* Accumulate into the force array */
    _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));

    /* Load old shift force as ( y x - z ) */
    t5 = _mm_load_ss(fshiftptr+2);
    t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));   /* fiy1 fix1  -   fiz1 */
    t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));   /* fiy3 fix3  -   fiz2 */
    t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));   /* fiy4 fix4  -   fiz3 */
    t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));   /* fiy2 fiy2 fix2 fix2 */
    t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));   /* fiy2 fix2  -   fiz4 */

    t1 = _mm_add_ps(t1,t2);
    t3 = _mm_add_ps(t3,t4);
    t1 = _mm_add_ps(t1,t3); /* y x - z */
    t5 = _mm_add_ps(t5,t1);

    _mm_store_ss(fshiftptr+2,t5);
    _mm_storeh_pi((__m64 *)(fshiftptr),t5);
}
2029
2030
/* Horizontally sum the four lanes of pot1 and add the result to *ptr1. */
static inline void
gmx_mm_update_1pot_ps(__m128 pot1, float *ptr1)
{
    __m128 hi,pair,swp,tot;

    hi   = _mm_movehl_ps(pot1,pot1);                      /* d c d c */
    pair = _mm_add_ps(pot1,hi);                           /* lanes 0,1: a+c, b+d */
    swp  = _mm_shuffle_ps(pair,pair,_MM_SHUFFLE(0,0,0,1));
    tot  = _mm_add_ss(pair,swp);                          /* lane 0: a+b+c+d */

    _mm_store_ss(ptr1,_mm_add_ss(tot,_mm_load_ss(ptr1)));
}
2039                                    
/* Horizontally sum two potential accumulators and add the scalar totals
 * to *ptr1 and *ptr2 respectively.
 * Bug fix: pot2 was extracted from lane 2 of t1, which only holds the
 * partial sum 2a+2c; the fully reduced value lives in lane 2 of pot1. */
static inline void
gmx_mm_update_2pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2)
{
    __m128 t1,t2;
    t1   = _mm_movehl_ps(pot2,pot1); /* 2d 2c 1d 1c */
    t2   = _mm_movelh_ps(pot1,pot2); /* 2b 2a 1b 1a */
    t1   = _mm_add_ps(t1,t2);        /* 2  2  1  1  */
    t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
    pot1 = _mm_add_ps(t1,t2);        /* -  2  -  1  */
    pot2 = _mm_movehl_ps(t2,pot1);   /* -  -  -  2  */

    _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
    _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));
}
2054
2055