/home/alexxy/Develop/gromacs/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse4_1_single.c

Bug Summary

File:	gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse4_1_single.c
Location:	line 120, column 22
Description:	Value stored to 'one' during its initialization is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35	/*
36	* Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37	*/
38	#ifdef HAVE_CONFIG_H1
39	#include <config.h>
40	#endif
41
42	#include <math.h>
43
44	#include "../nb_kernel.h"
45	#include "types/simple.h"
46	#include "gromacs/math/vec.h"
47	#include "nrnb.h"
48
49	#include "gromacs/simd/math_x86_sse4_1_single.h"
50	#include "kernelutil_x86_sse4_1_single.h"
51
52	/*
53	* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_single
54	* Electrostatics interaction: CubicSplineTable
55	* VdW interaction: LennardJones
56	* Geometry: Water3-Water3
57	* Calculate force/pot: PotentialAndForce
58	*/
59	void
60	nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_single
61	(t_nblist * gmx_restrict nlist,
62	rvec * gmx_restrict xx,
63	rvec * gmx_restrict ff,
64	t_forcerec * gmx_restrict fr,
65	t_mdatoms * gmx_restrict mdatoms,
66	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67	t_nrnb * gmx_restrict nrnb)
68	{
69	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70	* just 0 for non-waters.
71	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72	* jnr indices corresponding to data put in the four positions in the SIMD register.
73	*/
74	int i_shift_offset,i_coord_offset,outeriter,inneriter;
75	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76	int jnrA,jnrB,jnrC,jnrD;
77	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79	int iinr,jindex,jjnr,shiftidx,*gid;
80	real rcutoff_scalar;
81	real shiftvec,fshift,x,f;
82	real fjptrA,fjptrB,fjptrC,fjptrD;
83	real scratch[4*DIM3];
84	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85	int vdwioffset0;
86	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87	int vdwioffset1;
88	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89	int vdwioffset2;
90	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93	int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94	__m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95	int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96	__m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98	__m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99	__m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101	__m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102	__m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104	__m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105	__m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
107	real *charge;
108	int nvdwtype;
109	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110	int *vdwtype;
111	real *vdwparam;
112	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
113	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
114	__m128i vfitab;
115	__m128i ifour = _mm_set1_epi32(4);
116	__m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
117	real *vftab;
118	__m128 dummy_mask,cutoff_mask;
119	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
120	__m128 one = _mm_set1_ps(1.0);
	Value stored to 'one' during its initialization is never read
121	__m128 two = _mm_set1_ps(2.0);
122	x = xx[0];
123	f = ff[0];
124
125	nri = nlist->nri;
126	iinr = nlist->iinr;
127	jindex = nlist->jindex;
128	jjnr = nlist->jjnr;
129	shiftidx = nlist->shift;
130	gid = nlist->gid;
131	shiftvec = fr->shift_vec[0];
132	fshift = fr->fshift[0];
133	facel = _mm_set1_ps(fr->epsfac);
134	charge = mdatoms->chargeA;
135	nvdwtype = fr->ntype;
136	vdwparam = fr->nbfp;
137	vdwtype = mdatoms->typeA;
138
139	vftab = kernel_data->table_elec->data;
140	vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
141
142	/* Setup water-specific parameters */
143	inr = nlist->iinr[0];
144	iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
145	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
148
149	jq0 = _mm_set1_ps(charge[inr+0]);
150	jq1 = _mm_set1_ps(charge[inr+1]);
151	jq2 = _mm_set1_ps(charge[inr+2]);
152	vdwjidx0A = 2*vdwtype[inr+0];
153	qq00 = _mm_mul_ps(iq0,jq0);
154	c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
155	c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
156	qq01 = _mm_mul_ps(iq0,jq1);
157	qq02 = _mm_mul_ps(iq0,jq2);
158	qq10 = _mm_mul_ps(iq1,jq0);
159	qq11 = _mm_mul_ps(iq1,jq1);
160	qq12 = _mm_mul_ps(iq1,jq2);
161	qq20 = _mm_mul_ps(iq2,jq0);
162	qq21 = _mm_mul_ps(iq2,jq1);
163	qq22 = _mm_mul_ps(iq2,jq2);
164
165	/* Avoid stupid compiler warnings */
166	jnrA = jnrB = jnrC = jnrD = 0;
167	j_coord_offsetA = 0;
168	j_coord_offsetB = 0;
169	j_coord_offsetC = 0;
170	j_coord_offsetD = 0;
171
172	outeriter = 0;
173	inneriter = 0;
174
175	for(iidx=0;iidx<4*DIM3;iidx++)
176	{
177	scratch[iidx] = 0.0;
178	}
179
180	/* Start outer loop over neighborlists */
181	for(iidx=0; iidx<nri; iidx++)
182	{
183	/* Load shift vector for this list */
184	i_shift_offset = DIM3*shiftidx[iidx];
185
186	/* Load limits for loop over neighbors */
187	j_index_start = jindex[iidx];
188	j_index_end = jindex[iidx+1];
189
190	/* Get outer coordinate index */
191	inr = iinr[iidx];
192	i_coord_offset = DIM3*inr;
193
194	/* Load i particle coords and add shift vector */
195	gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
196	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
197
198	fix0 = _mm_setzero_ps();
199	fiy0 = _mm_setzero_ps();
200	fiz0 = _mm_setzero_ps();
201	fix1 = _mm_setzero_ps();
202	fiy1 = _mm_setzero_ps();
203	fiz1 = _mm_setzero_ps();
204	fix2 = _mm_setzero_ps();
205	fiy2 = _mm_setzero_ps();
206	fiz2 = _mm_setzero_ps();
207
208	/* Reset potential sums */
209	velecsum = _mm_setzero_ps();
210	vvdwsum = _mm_setzero_ps();
211
212	/* Start inner kernel loop */
213	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
214	{
215
216	/* Get j neighbor index, and coordinate index */
217	jnrA = jjnr[jidx];
218	jnrB = jjnr[jidx+1];
219	jnrC = jjnr[jidx+2];
220	jnrD = jjnr[jidx+3];
221	j_coord_offsetA = DIM3*jnrA;
222	j_coord_offsetB = DIM3*jnrB;
223	j_coord_offsetC = DIM3*jnrC;
224	j_coord_offsetD = DIM3*jnrD;
225
226	/* load j atom coordinates */
227	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
228	x+j_coord_offsetC,x+j_coord_offsetD,
229	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
230
231	/* Calculate displacement vector */
232	dx00 = _mm_sub_ps(ix0,jx0);
233	dy00 = _mm_sub_ps(iy0,jy0);
234	dz00 = _mm_sub_ps(iz0,jz0);
235	dx01 = _mm_sub_ps(ix0,jx1);
236	dy01 = _mm_sub_ps(iy0,jy1);
237	dz01 = _mm_sub_ps(iz0,jz1);
238	dx02 = _mm_sub_ps(ix0,jx2);
239	dy02 = _mm_sub_ps(iy0,jy2);
240	dz02 = _mm_sub_ps(iz0,jz2);
241	dx10 = _mm_sub_ps(ix1,jx0);
242	dy10 = _mm_sub_ps(iy1,jy0);
243	dz10 = _mm_sub_ps(iz1,jz0);
244	dx11 = _mm_sub_ps(ix1,jx1);
245	dy11 = _mm_sub_ps(iy1,jy1);
246	dz11 = _mm_sub_ps(iz1,jz1);
247	dx12 = _mm_sub_ps(ix1,jx2);
248	dy12 = _mm_sub_ps(iy1,jy2);
249	dz12 = _mm_sub_ps(iz1,jz2);
250	dx20 = _mm_sub_ps(ix2,jx0);
251	dy20 = _mm_sub_ps(iy2,jy0);
252	dz20 = _mm_sub_ps(iz2,jz0);
253	dx21 = _mm_sub_ps(ix2,jx1);
254	dy21 = _mm_sub_ps(iy2,jy1);
255	dz21 = _mm_sub_ps(iz2,jz1);
256	dx22 = _mm_sub_ps(ix2,jx2);
257	dy22 = _mm_sub_ps(iy2,jy2);
258	dz22 = _mm_sub_ps(iz2,jz2);
259
260	/* Calculate squared distance and things based on it */
261	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
262	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
263	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
264	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
265	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
266	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
267	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
268	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
269	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
270
271	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
272	rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
273	rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
274	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
275	rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
276	rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
277	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
278	rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
279	rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
280
281	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
282
283	fjx0 = _mm_setzero_ps();
284	fjy0 = _mm_setzero_ps();
285	fjz0 = _mm_setzero_ps();
286	fjx1 = _mm_setzero_ps();
287	fjy1 = _mm_setzero_ps();
288	fjz1 = _mm_setzero_ps();
289	fjx2 = _mm_setzero_ps();
290	fjy2 = _mm_setzero_ps();
291	fjz2 = _mm_setzero_ps();
292
293	/**************************
294	* CALCULATE INTERACTIONS *
295	**************************/
296
297	r00 = _mm_mul_ps(rsq00,rinv00);
298
299	/* Calculate table index by multiplying r with table scale and truncate to integer */
300	rt = _mm_mul_ps(r00,vftabscale);
301	vfitab = _mm_cvttps_epi32(rt);
302	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
303	vfitab = _mm_slli_epi32(vfitab,2);
304
305	/* CUBIC SPLINE TABLE ELECTROSTATICS */
306	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
307	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
308	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
309	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
310	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
311	Heps = _mm_mul_ps(vfeps,H);
312	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
313	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
314	velec = _mm_mul_ps(qq00,VV);
315	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
316	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
317
318	/* LENNARD-JONES DISPERSION/REPULSION */
319
320	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
321	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
322	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
323	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
324	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
325
326	/* Update potential sum for this i atom from the interaction with this j atom. */
327	velecsum = _mm_add_ps(velecsum,velec);
328	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
329
330	fscal = _mm_add_ps(felec,fvdw);
331
332	/* Calculate temporary vectorial force */
333	tx = _mm_mul_ps(fscal,dx00);
334	ty = _mm_mul_ps(fscal,dy00);
335	tz = _mm_mul_ps(fscal,dz00);
336
337	/* Update vectorial force */
338	fix0 = _mm_add_ps(fix0,tx);
339	fiy0 = _mm_add_ps(fiy0,ty);
340	fiz0 = _mm_add_ps(fiz0,tz);
341
342	fjx0 = _mm_add_ps(fjx0,tx);
343	fjy0 = _mm_add_ps(fjy0,ty);
344	fjz0 = _mm_add_ps(fjz0,tz);
345
346	/**************************
347	* CALCULATE INTERACTIONS *
348	**************************/
349
350	r01 = _mm_mul_ps(rsq01,rinv01);
351
352	/* Calculate table index by multiplying r with table scale and truncate to integer */
353	rt = _mm_mul_ps(r01,vftabscale);
354	vfitab = _mm_cvttps_epi32(rt);
355	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
356	vfitab = _mm_slli_epi32(vfitab,2);
357
358	/* CUBIC SPLINE TABLE ELECTROSTATICS */
359	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
360	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
361	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
362	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
363	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
364	Heps = _mm_mul_ps(vfeps,H);
365	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
366	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
367	velec = _mm_mul_ps(qq01,VV);
368	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
369	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
370
371	/* Update potential sum for this i atom from the interaction with this j atom. */
372	velecsum = _mm_add_ps(velecsum,velec);
373
374	fscal = felec;
375
376	/* Calculate temporary vectorial force */
377	tx = _mm_mul_ps(fscal,dx01);
378	ty = _mm_mul_ps(fscal,dy01);
379	tz = _mm_mul_ps(fscal,dz01);
380
381	/* Update vectorial force */
382	fix0 = _mm_add_ps(fix0,tx);
383	fiy0 = _mm_add_ps(fiy0,ty);
384	fiz0 = _mm_add_ps(fiz0,tz);
385
386	fjx1 = _mm_add_ps(fjx1,tx);
387	fjy1 = _mm_add_ps(fjy1,ty);
388	fjz1 = _mm_add_ps(fjz1,tz);
389
390	/**************************
391	* CALCULATE INTERACTIONS *
392	**************************/
393
394	r02 = _mm_mul_ps(rsq02,rinv02);
395
396	/* Calculate table index by multiplying r with table scale and truncate to integer */
397	rt = _mm_mul_ps(r02,vftabscale);
398	vfitab = _mm_cvttps_epi32(rt);
399	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
400	vfitab = _mm_slli_epi32(vfitab,2);
401
402	/* CUBIC SPLINE TABLE ELECTROSTATICS */
403	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
404	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
405	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
406	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
407	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
408	Heps = _mm_mul_ps(vfeps,H);
409	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
410	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
411	velec = _mm_mul_ps(qq02,VV);
412	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
413	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
414
415	/* Update potential sum for this i atom from the interaction with this j atom. */
416	velecsum = _mm_add_ps(velecsum,velec);
417
418	fscal = felec;
419
420	/* Calculate temporary vectorial force */
421	tx = _mm_mul_ps(fscal,dx02);
422	ty = _mm_mul_ps(fscal,dy02);
423	tz = _mm_mul_ps(fscal,dz02);
424
425	/* Update vectorial force */
426	fix0 = _mm_add_ps(fix0,tx);
427	fiy0 = _mm_add_ps(fiy0,ty);
428	fiz0 = _mm_add_ps(fiz0,tz);
429
430	fjx2 = _mm_add_ps(fjx2,tx);
431	fjy2 = _mm_add_ps(fjy2,ty);
432	fjz2 = _mm_add_ps(fjz2,tz);
433
434	/**************************
435	* CALCULATE INTERACTIONS *
436	**************************/
437
438	r10 = _mm_mul_ps(rsq10,rinv10);
439
440	/* Calculate table index by multiplying r with table scale and truncate to integer */
441	rt = _mm_mul_ps(r10,vftabscale);
442	vfitab = _mm_cvttps_epi32(rt);
443	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
444	vfitab = _mm_slli_epi32(vfitab,2);
445
446	/* CUBIC SPLINE TABLE ELECTROSTATICS */
447	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
448	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
449	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
450	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
451	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
452	Heps = _mm_mul_ps(vfeps,H);
453	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
454	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
455	velec = _mm_mul_ps(qq10,VV);
456	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
457	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
458
459	/* Update potential sum for this i atom from the interaction with this j atom. */
460	velecsum = _mm_add_ps(velecsum,velec);
461
462	fscal = felec;
463
464	/* Calculate temporary vectorial force */
465	tx = _mm_mul_ps(fscal,dx10);
466	ty = _mm_mul_ps(fscal,dy10);
467	tz = _mm_mul_ps(fscal,dz10);
468
469	/* Update vectorial force */
470	fix1 = _mm_add_ps(fix1,tx);
471	fiy1 = _mm_add_ps(fiy1,ty);
472	fiz1 = _mm_add_ps(fiz1,tz);
473
474	fjx0 = _mm_add_ps(fjx0,tx);
475	fjy0 = _mm_add_ps(fjy0,ty);
476	fjz0 = _mm_add_ps(fjz0,tz);
477
478	/**************************
479	* CALCULATE INTERACTIONS *
480	**************************/
481
482	r11 = _mm_mul_ps(rsq11,rinv11);
483
484	/* Calculate table index by multiplying r with table scale and truncate to integer */
485	rt = _mm_mul_ps(r11,vftabscale);
486	vfitab = _mm_cvttps_epi32(rt);
487	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
488	vfitab = _mm_slli_epi32(vfitab,2);
489
490	/* CUBIC SPLINE TABLE ELECTROSTATICS */
491	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
492	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
493	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
494	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
495	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
496	Heps = _mm_mul_ps(vfeps,H);
497	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
498	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
499	velec = _mm_mul_ps(qq11,VV);
500	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
501	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
502
503	/* Update potential sum for this i atom from the interaction with this j atom. */
504	velecsum = _mm_add_ps(velecsum,velec);
505
506	fscal = felec;
507
508	/* Calculate temporary vectorial force */
509	tx = _mm_mul_ps(fscal,dx11);
510	ty = _mm_mul_ps(fscal,dy11);
511	tz = _mm_mul_ps(fscal,dz11);
512
513	/* Update vectorial force */
514	fix1 = _mm_add_ps(fix1,tx);
515	fiy1 = _mm_add_ps(fiy1,ty);
516	fiz1 = _mm_add_ps(fiz1,tz);
517
518	fjx1 = _mm_add_ps(fjx1,tx);
519	fjy1 = _mm_add_ps(fjy1,ty);
520	fjz1 = _mm_add_ps(fjz1,tz);
521
522	/**************************
523	* CALCULATE INTERACTIONS *
524	**************************/
525
526	r12 = _mm_mul_ps(rsq12,rinv12);
527
528	/* Calculate table index by multiplying r with table scale and truncate to integer */
529	rt = _mm_mul_ps(r12,vftabscale);
530	vfitab = _mm_cvttps_epi32(rt);
531	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
532	vfitab = _mm_slli_epi32(vfitab,2);
533
534	/* CUBIC SPLINE TABLE ELECTROSTATICS */
535	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
536	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
537	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
538	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
539	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
540	Heps = _mm_mul_ps(vfeps,H);
541	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
542	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
543	velec = _mm_mul_ps(qq12,VV);
544	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
545	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
546
547	/* Update potential sum for this i atom from the interaction with this j atom. */
548	velecsum = _mm_add_ps(velecsum,velec);
549
550	fscal = felec;
551
552	/* Calculate temporary vectorial force */
553	tx = _mm_mul_ps(fscal,dx12);
554	ty = _mm_mul_ps(fscal,dy12);
555	tz = _mm_mul_ps(fscal,dz12);
556
557	/* Update vectorial force */
558	fix1 = _mm_add_ps(fix1,tx);
559	fiy1 = _mm_add_ps(fiy1,ty);
560	fiz1 = _mm_add_ps(fiz1,tz);
561
562	fjx2 = _mm_add_ps(fjx2,tx);
563	fjy2 = _mm_add_ps(fjy2,ty);
564	fjz2 = _mm_add_ps(fjz2,tz);
565
566	/**************************
567	* CALCULATE INTERACTIONS *
568	**************************/
569
570	r20 = _mm_mul_ps(rsq20,rinv20);
571
572	/* Calculate table index by multiplying r with table scale and truncate to integer */
573	rt = _mm_mul_ps(r20,vftabscale);
574	vfitab = _mm_cvttps_epi32(rt);
575	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
576	vfitab = _mm_slli_epi32(vfitab,2);
577
578	/* CUBIC SPLINE TABLE ELECTROSTATICS */
579	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
580	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
581	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
582	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
583	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
584	Heps = _mm_mul_ps(vfeps,H);
585	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
586	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
587	velec = _mm_mul_ps(qq20,VV);
588	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
589	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
590
591	/* Update potential sum for this i atom from the interaction with this j atom. */
592	velecsum = _mm_add_ps(velecsum,velec);
593
594	fscal = felec;
595
596	/* Calculate temporary vectorial force */
597	tx = _mm_mul_ps(fscal,dx20);
598	ty = _mm_mul_ps(fscal,dy20);
599	tz = _mm_mul_ps(fscal,dz20);
600
601	/* Update vectorial force */
602	fix2 = _mm_add_ps(fix2,tx);
603	fiy2 = _mm_add_ps(fiy2,ty);
604	fiz2 = _mm_add_ps(fiz2,tz);
605
606	fjx0 = _mm_add_ps(fjx0,tx);
607	fjy0 = _mm_add_ps(fjy0,ty);
608	fjz0 = _mm_add_ps(fjz0,tz);
609
610	/**************************
611	* CALCULATE INTERACTIONS *
612	**************************/
613
614	r21 = _mm_mul_ps(rsq21,rinv21);
615
616	/* Calculate table index by multiplying r with table scale and truncate to integer */
617	rt = _mm_mul_ps(r21,vftabscale);
618	vfitab = _mm_cvttps_epi32(rt);
619	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
620	vfitab = _mm_slli_epi32(vfitab,2);
621
622	/* CUBIC SPLINE TABLE ELECTROSTATICS */
623	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
624	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
625	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
626	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
627	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
628	Heps = _mm_mul_ps(vfeps,H);
629	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
630	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
631	velec = _mm_mul_ps(qq21,VV);
632	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
633	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
634
635	/* Update potential sum for this i atom from the interaction with this j atom. */
636	velecsum = _mm_add_ps(velecsum,velec);
637
638	fscal = felec;
639
640	/* Calculate temporary vectorial force */
641	tx = _mm_mul_ps(fscal,dx21);
642	ty = _mm_mul_ps(fscal,dy21);
643	tz = _mm_mul_ps(fscal,dz21);
644
645	/* Update vectorial force */
646	fix2 = _mm_add_ps(fix2,tx);
647	fiy2 = _mm_add_ps(fiy2,ty);
648	fiz2 = _mm_add_ps(fiz2,tz);
649
650	fjx1 = _mm_add_ps(fjx1,tx);
651	fjy1 = _mm_add_ps(fjy1,ty);
652	fjz1 = _mm_add_ps(fjz1,tz);
653
654	/**************************
655	* CALCULATE INTERACTIONS *
656	**************************/
657
658	r22 = _mm_mul_ps(rsq22,rinv22);
659
660	/* Calculate table index by multiplying r with table scale and truncate to integer */
661	rt = _mm_mul_ps(r22,vftabscale);
662	vfitab = _mm_cvttps_epi32(rt);
663	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
664	vfitab = _mm_slli_epi32(vfitab,2);
665
666	/* CUBIC SPLINE TABLE ELECTROSTATICS */
667	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
668	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
669	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
670	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
671	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
672	Heps = _mm_mul_ps(vfeps,H);
673	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
674	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
675	velec = _mm_mul_ps(qq22,VV);
676	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
677	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
678
679	/* Update potential sum for this i atom from the interaction with this j atom. */
680	velecsum = _mm_add_ps(velecsum,velec);
681
682	fscal = felec;
683
684	/* Calculate temporary vectorial force */
685	tx = _mm_mul_ps(fscal,dx22);
686	ty = _mm_mul_ps(fscal,dy22);
687	tz = _mm_mul_ps(fscal,dz22);
688
689	/* Update vectorial force */
690	fix2 = _mm_add_ps(fix2,tx);
691	fiy2 = _mm_add_ps(fiy2,ty);
692	fiz2 = _mm_add_ps(fiz2,tz);
693
694	fjx2 = _mm_add_ps(fjx2,tx);
695	fjy2 = _mm_add_ps(fjy2,ty);
696	fjz2 = _mm_add_ps(fjz2,tz);
697
698	fjptrA = f+j_coord_offsetA;
699	fjptrB = f+j_coord_offsetB;
700	fjptrC = f+j_coord_offsetC;
701	fjptrD = f+j_coord_offsetD;
702
703	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
704	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
705
706	/* Inner loop uses 400 flops */
707	}
708
709	if(jidx<j_index_end)
710	{
711
712	/* Get j neighbor index, and coordinate index */
713	jnrlistA = jjnr[jidx];
714	jnrlistB = jjnr[jidx+1];
715	jnrlistC = jjnr[jidx+2];
716	jnrlistD = jjnr[jidx+3];
717	/* Sign of each element will be negative for non-real atoms.
718	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
719	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
720	*/
721	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
722	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
723	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
724	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
725	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
726	j_coord_offsetA = DIM3*jnrA;
727	j_coord_offsetB = DIM3*jnrB;
728	j_coord_offsetC = DIM3*jnrC;
729	j_coord_offsetD = DIM3*jnrD;
730
731	/* load j atom coordinates */
732	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
733	x+j_coord_offsetC,x+j_coord_offsetD,
734	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
735
736	/* Calculate displacement vector */
737	dx00 = _mm_sub_ps(ix0,jx0);
738	dy00 = _mm_sub_ps(iy0,jy0);
739	dz00 = _mm_sub_ps(iz0,jz0);
740	dx01 = _mm_sub_ps(ix0,jx1);
741	dy01 = _mm_sub_ps(iy0,jy1);
742	dz01 = _mm_sub_ps(iz0,jz1);
743	dx02 = _mm_sub_ps(ix0,jx2);
744	dy02 = _mm_sub_ps(iy0,jy2);
745	dz02 = _mm_sub_ps(iz0,jz2);
746	dx10 = _mm_sub_ps(ix1,jx0);
747	dy10 = _mm_sub_ps(iy1,jy0);
748	dz10 = _mm_sub_ps(iz1,jz0);
749	dx11 = _mm_sub_ps(ix1,jx1);
750	dy11 = _mm_sub_ps(iy1,jy1);
751	dz11 = _mm_sub_ps(iz1,jz1);
752	dx12 = _mm_sub_ps(ix1,jx2);
753	dy12 = _mm_sub_ps(iy1,jy2);
754	dz12 = _mm_sub_ps(iz1,jz2);
755	dx20 = _mm_sub_ps(ix2,jx0);
756	dy20 = _mm_sub_ps(iy2,jy0);
757	dz20 = _mm_sub_ps(iz2,jz0);
758	dx21 = _mm_sub_ps(ix2,jx1);
759	dy21 = _mm_sub_ps(iy2,jy1);
760	dz21 = _mm_sub_ps(iz2,jz1);
761	dx22 = _mm_sub_ps(ix2,jx2);
762	dy22 = _mm_sub_ps(iy2,jy2);
763	dz22 = _mm_sub_ps(iz2,jz2);
764
765	/* Calculate squared distance and things based on it */
766	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
767	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
768	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
769	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
770	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
771	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
772	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
773	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
774	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
775
776	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
777	rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
778	rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
779	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
780	rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
781	rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
782	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
783	rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
784	rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
785
786	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
787
788	fjx0 = _mm_setzero_ps();
789	fjy0 = _mm_setzero_ps();
790	fjz0 = _mm_setzero_ps();
791	fjx1 = _mm_setzero_ps();
792	fjy1 = _mm_setzero_ps();
793	fjz1 = _mm_setzero_ps();
794	fjx2 = _mm_setzero_ps();
795	fjy2 = _mm_setzero_ps();
796	fjz2 = _mm_setzero_ps();
797
798	/**************************
799	* CALCULATE INTERACTIONS *
800	**************************/
801
802	r00 = _mm_mul_ps(rsq00,rinv00);
803	r00 = _mm_andnot_ps(dummy_mask,r00);
804
805	/* Calculate table index by multiplying r with table scale and truncate to integer */
806	rt = _mm_mul_ps(r00,vftabscale);
807	vfitab = _mm_cvttps_epi32(rt);
808	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
809	vfitab = _mm_slli_epi32(vfitab,2);
810
811	/* CUBIC SPLINE TABLE ELECTROSTATICS */
812	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
813	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
814	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
815	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
816	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
817	Heps = _mm_mul_ps(vfeps,H);
818	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
819	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
820	velec = _mm_mul_ps(qq00,VV);
821	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
822	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
823
824	/* LENNARD-JONES DISPERSION/REPULSION */
825
826	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
827	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
828	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
829	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
830	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
831
832	/* Update potential sum for this i atom from the interaction with this j atom. */
833	velec = _mm_andnot_ps(dummy_mask,velec);
834	velecsum = _mm_add_ps(velecsum,velec);
835	vvdw = _mm_andnot_ps(dummy_mask,vvdw);
836	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
837
838	fscal = _mm_add_ps(felec,fvdw);
839
840	fscal = _mm_andnot_ps(dummy_mask,fscal);
841
842	/* Calculate temporary vectorial force */
843	tx = _mm_mul_ps(fscal,dx00);
844	ty = _mm_mul_ps(fscal,dy00);
845	tz = _mm_mul_ps(fscal,dz00);
846
847	/* Update vectorial force */
848	fix0 = _mm_add_ps(fix0,tx);
849	fiy0 = _mm_add_ps(fiy0,ty);
850	fiz0 = _mm_add_ps(fiz0,tz);
851
852	fjx0 = _mm_add_ps(fjx0,tx);
853	fjy0 = _mm_add_ps(fjy0,ty);
854	fjz0 = _mm_add_ps(fjz0,tz);
855
856	/**************************
857	* CALCULATE INTERACTIONS *
858	**************************/
859
860	r01 = _mm_mul_ps(rsq01,rinv01);
861	r01 = _mm_andnot_ps(dummy_mask,r01);
862
863	/* Calculate table index by multiplying r with table scale and truncate to integer */
864	rt = _mm_mul_ps(r01,vftabscale);
865	vfitab = _mm_cvttps_epi32(rt);
866	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
867	vfitab = _mm_slli_epi32(vfitab,2);
868
869	/* CUBIC SPLINE TABLE ELECTROSTATICS */
870	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
871	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
872	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
873	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
874	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
875	Heps = _mm_mul_ps(vfeps,H);
876	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
877	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
878	velec = _mm_mul_ps(qq01,VV);
879	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
880	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
881
882	/* Update potential sum for this i atom from the interaction with this j atom. */
883	velec = _mm_andnot_ps(dummy_mask,velec);
884	velecsum = _mm_add_ps(velecsum,velec);
885
886	fscal = felec;
887
888	fscal = _mm_andnot_ps(dummy_mask,fscal);
889
890	/* Calculate temporary vectorial force */
891	tx = _mm_mul_ps(fscal,dx01);
892	ty = _mm_mul_ps(fscal,dy01);
893	tz = _mm_mul_ps(fscal,dz01);
894
895	/* Update vectorial force */
896	fix0 = _mm_add_ps(fix0,tx);
897	fiy0 = _mm_add_ps(fiy0,ty);
898	fiz0 = _mm_add_ps(fiz0,tz);
899
900	fjx1 = _mm_add_ps(fjx1,tx);
901	fjy1 = _mm_add_ps(fjy1,ty);
902	fjz1 = _mm_add_ps(fjz1,tz);
903
904	/**************************
905	* CALCULATE INTERACTIONS *
906	**************************/
907
908	r02 = _mm_mul_ps(rsq02,rinv02);
909	r02 = _mm_andnot_ps(dummy_mask,r02);
910
911	/* Calculate table index by multiplying r with table scale and truncate to integer */
912	rt = _mm_mul_ps(r02,vftabscale);
913	vfitab = _mm_cvttps_epi32(rt);
914	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
915	vfitab = _mm_slli_epi32(vfitab,2);
916
917	/* CUBIC SPLINE TABLE ELECTROSTATICS */
918	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
919	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
920	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
921	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
922	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
923	Heps = _mm_mul_ps(vfeps,H);
924	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
925	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
926	velec = _mm_mul_ps(qq02,VV);
927	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
928	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
929
930	/* Update potential sum for this i atom from the interaction with this j atom. */
931	velec = _mm_andnot_ps(dummy_mask,velec);
932	velecsum = _mm_add_ps(velecsum,velec);
933
934	fscal = felec;
935
936	fscal = _mm_andnot_ps(dummy_mask,fscal);
937
938	/* Calculate temporary vectorial force */
939	tx = _mm_mul_ps(fscal,dx02);
940	ty = _mm_mul_ps(fscal,dy02);
941	tz = _mm_mul_ps(fscal,dz02);
942
943	/* Update vectorial force */
944	fix0 = _mm_add_ps(fix0,tx);
945	fiy0 = _mm_add_ps(fiy0,ty);
946	fiz0 = _mm_add_ps(fiz0,tz);
947
948	fjx2 = _mm_add_ps(fjx2,tx);
949	fjy2 = _mm_add_ps(fjy2,ty);
950	fjz2 = _mm_add_ps(fjz2,tz);
951
952	/**************************
953	* CALCULATE INTERACTIONS *
954	**************************/
955
956	r10 = _mm_mul_ps(rsq10,rinv10);
957	r10 = _mm_andnot_ps(dummy_mask,r10);
958
959	/* Calculate table index by multiplying r with table scale and truncate to integer */
960	rt = _mm_mul_ps(r10,vftabscale);
961	vfitab = _mm_cvttps_epi32(rt);
962	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
963	vfitab = _mm_slli_epi32(vfitab,2);
964
965	/* CUBIC SPLINE TABLE ELECTROSTATICS */
966	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
967	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
968	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
969	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
970	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
971	Heps = _mm_mul_ps(vfeps,H);
972	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
973	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
974	velec = _mm_mul_ps(qq10,VV);
975	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
976	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
977
978	/* Update potential sum for this i atom from the interaction with this j atom. */
979	velec = _mm_andnot_ps(dummy_mask,velec);
980	velecsum = _mm_add_ps(velecsum,velec);
981
982	fscal = felec;
983
984	fscal = _mm_andnot_ps(dummy_mask,fscal);
985
986	/* Calculate temporary vectorial force */
987	tx = _mm_mul_ps(fscal,dx10);
988	ty = _mm_mul_ps(fscal,dy10);
989	tz = _mm_mul_ps(fscal,dz10);
990
991	/* Update vectorial force */
992	fix1 = _mm_add_ps(fix1,tx);
993	fiy1 = _mm_add_ps(fiy1,ty);
994	fiz1 = _mm_add_ps(fiz1,tz);
995
996	fjx0 = _mm_add_ps(fjx0,tx);
997	fjy0 = _mm_add_ps(fjy0,ty);
998	fjz0 = _mm_add_ps(fjz0,tz);
999
1000	/**************************
1001	* CALCULATE INTERACTIONS *
1002	**************************/
1003
1004	r11 = _mm_mul_ps(rsq11,rinv11);
1005	r11 = _mm_andnot_ps(dummy_mask,r11);
1006
1007	/* Calculate table index by multiplying r with table scale and truncate to integer */
1008	rt = _mm_mul_ps(r11,vftabscale);
1009	vfitab = _mm_cvttps_epi32(rt);
1010	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1011	vfitab = _mm_slli_epi32(vfitab,2);
1012
1013	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1014	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
1015	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
1016	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
1017	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
1018	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
1019	Heps = _mm_mul_ps(vfeps,H);
1020	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1021	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1022	velec = _mm_mul_ps(qq11,VV);
1023	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1024	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1025
1026	/* Update potential sum for this i atom from the interaction with this j atom. */
1027	velec = _mm_andnot_ps(dummy_mask,velec);
1028	velecsum = _mm_add_ps(velecsum,velec);
1029
1030	fscal = felec;
1031
1032	fscal = _mm_andnot_ps(dummy_mask,fscal);
1033
1034	/* Calculate temporary vectorial force */
1035	tx = _mm_mul_ps(fscal,dx11);
1036	ty = _mm_mul_ps(fscal,dy11);
1037	tz = _mm_mul_ps(fscal,dz11);
1038
1039	/* Update vectorial force */
1040	fix1 = _mm_add_ps(fix1,tx);
1041	fiy1 = _mm_add_ps(fiy1,ty);
1042	fiz1 = _mm_add_ps(fiz1,tz);
1043
1044	fjx1 = _mm_add_ps(fjx1,tx);
1045	fjy1 = _mm_add_ps(fjy1,ty);
1046	fjz1 = _mm_add_ps(fjz1,tz);
1047
1048	/**************************
1049	* CALCULATE INTERACTIONS *
1050	**************************/
1051
1052	r12 = _mm_mul_ps(rsq12,rinv12);
1053	r12 = _mm_andnot_ps(dummy_mask,r12);
1054
1055	/* Calculate table index by multiplying r with table scale and truncate to integer */
1056	rt = _mm_mul_ps(r12,vftabscale);
1057	vfitab = _mm_cvttps_epi32(rt);
1058	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1059	vfitab = _mm_slli_epi32(vfitab,2);
1060
1061	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1062	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
1063	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
1064	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
1065	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
1066	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
1067	Heps = _mm_mul_ps(vfeps,H);
1068	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1069	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1070	velec = _mm_mul_ps(qq12,VV);
1071	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1072	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1073
1074	/* Update potential sum for this i atom from the interaction with this j atom. */
1075	velec = _mm_andnot_ps(dummy_mask,velec);
1076	velecsum = _mm_add_ps(velecsum,velec);
1077
1078	fscal = felec;
1079
1080	fscal = _mm_andnot_ps(dummy_mask,fscal);
1081
1082	/* Calculate temporary vectorial force */
1083	tx = _mm_mul_ps(fscal,dx12);
1084	ty = _mm_mul_ps(fscal,dy12);
1085	tz = _mm_mul_ps(fscal,dz12);
1086
1087	/* Update vectorial force */
1088	fix1 = _mm_add_ps(fix1,tx);
1089	fiy1 = _mm_add_ps(fiy1,ty);
1090	fiz1 = _mm_add_ps(fiz1,tz);
1091
1092	fjx2 = _mm_add_ps(fjx2,tx);
1093	fjy2 = _mm_add_ps(fjy2,ty);
1094	fjz2 = _mm_add_ps(fjz2,tz);
1095
1096	/**************************
1097	* CALCULATE INTERACTIONS *
1098	**************************/
1099
1100	r20 = _mm_mul_ps(rsq20,rinv20);
1101	r20 = _mm_andnot_ps(dummy_mask,r20);
1102
1103	/* Calculate table index by multiplying r with table scale and truncate to integer */
1104	rt = _mm_mul_ps(r20,vftabscale);
1105	vfitab = _mm_cvttps_epi32(rt);
1106	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1107	vfitab = _mm_slli_epi32(vfitab,2);
1108
1109	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1110	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
1111	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
1112	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
1113	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
1114	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
1115	Heps = _mm_mul_ps(vfeps,H);
1116	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1117	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1118	velec = _mm_mul_ps(qq20,VV);
1119	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1120	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1121
1122	/* Update potential sum for this i atom from the interaction with this j atom. */
1123	velec = _mm_andnot_ps(dummy_mask,velec);
1124	velecsum = _mm_add_ps(velecsum,velec);
1125
1126	fscal = felec;
1127
1128	fscal = _mm_andnot_ps(dummy_mask,fscal);
1129
1130	/* Calculate temporary vectorial force */
1131	tx = _mm_mul_ps(fscal,dx20);
1132	ty = _mm_mul_ps(fscal,dy20);
1133	tz = _mm_mul_ps(fscal,dz20);
1134
1135	/* Update vectorial force */
1136	fix2 = _mm_add_ps(fix2,tx);
1137	fiy2 = _mm_add_ps(fiy2,ty);
1138	fiz2 = _mm_add_ps(fiz2,tz);
1139
1140	fjx0 = _mm_add_ps(fjx0,tx);
1141	fjy0 = _mm_add_ps(fjy0,ty);
1142	fjz0 = _mm_add_ps(fjz0,tz);
1143
1144	/**************************
1145	* CALCULATE INTERACTIONS *
1146	**************************/
1147
1148	r21 = _mm_mul_ps(rsq21,rinv21);
1149	r21 = _mm_andnot_ps(dummy_mask,r21);
1150
1151	/* Calculate table index by multiplying r with table scale and truncate to integer */
1152	rt = _mm_mul_ps(r21,vftabscale);
1153	vfitab = _mm_cvttps_epi32(rt);
1154	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1155	vfitab = _mm_slli_epi32(vfitab,2);
1156
1157	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1158	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
1159	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
1160	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
1161	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
1162	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
1163	Heps = _mm_mul_ps(vfeps,H);
1164	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1165	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1166	velec = _mm_mul_ps(qq21,VV);
1167	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1168	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1169
1170	/* Update potential sum for this i atom from the interaction with this j atom. */
1171	velec = _mm_andnot_ps(dummy_mask,velec);
1172	velecsum = _mm_add_ps(velecsum,velec);
1173
1174	fscal = felec;
1175
1176	fscal = _mm_andnot_ps(dummy_mask,fscal);
1177
1178	/* Calculate temporary vectorial force */
1179	tx = _mm_mul_ps(fscal,dx21);
1180	ty = _mm_mul_ps(fscal,dy21);
1181	tz = _mm_mul_ps(fscal,dz21);
1182
1183	/* Update vectorial force */
1184	fix2 = _mm_add_ps(fix2,tx);
1185	fiy2 = _mm_add_ps(fiy2,ty);
1186	fiz2 = _mm_add_ps(fiz2,tz);
1187
1188	fjx1 = _mm_add_ps(fjx1,tx);
1189	fjy1 = _mm_add_ps(fjy1,ty);
1190	fjz1 = _mm_add_ps(fjz1,tz);
1191
1192	/**************************
1193	* CALCULATE INTERACTIONS *
1194	**************************/
1195
1196	r22 = _mm_mul_ps(rsq22,rinv22);
1197	r22 = _mm_andnot_ps(dummy_mask,r22);
1198
1199	/* Calculate table index by multiplying r with table scale and truncate to integer */
1200	rt = _mm_mul_ps(r22,vftabscale);
1201	vfitab = _mm_cvttps_epi32(rt);
1202	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1203	vfitab = _mm_slli_epi32(vfitab,2);
1204
1205	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1206	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
1207	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
1208	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
1209	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
1210	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
1211	Heps = _mm_mul_ps(vfeps,H);
1212	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1213	VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1214	velec = _mm_mul_ps(qq22,VV);
1215	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1216	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1217
1218	/* Update potential sum for this i atom from the interaction with this j atom. */
1219	velec = _mm_andnot_ps(dummy_mask,velec);
1220	velecsum = _mm_add_ps(velecsum,velec);
1221
1222	fscal = felec;
1223
1224	fscal = _mm_andnot_ps(dummy_mask,fscal);
1225
1226	/* Calculate temporary vectorial force */
1227	tx = _mm_mul_ps(fscal,dx22);
1228	ty = _mm_mul_ps(fscal,dy22);
1229	tz = _mm_mul_ps(fscal,dz22);
1230
1231	/* Update vectorial force */
1232	fix2 = _mm_add_ps(fix2,tx);
1233	fiy2 = _mm_add_ps(fiy2,ty);
1234	fiz2 = _mm_add_ps(fiz2,tz);
1235
1236	fjx2 = _mm_add_ps(fjx2,tx);
1237	fjy2 = _mm_add_ps(fjy2,ty);
1238	fjz2 = _mm_add_ps(fjz2,tz);
1239
1240	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1241	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1242	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1243	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1244
1245	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1246	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1247
1248	/* Inner loop uses 409 flops */
1249	}
1250
1251	/* End of innermost loop */
1252
1253	gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1254	f+i_coord_offset,fshift+i_shift_offset);
1255
1256	ggid = gid[iidx];
1257	/* Update potential energies */
1258	gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1259	gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1260
1261	/* Increment number of inner iterations */
1262	inneriter += j_index_end - j_index_start;
1263
1264	/* Outer loop uses 20 flops */
1265	}
1266
1267	/* Increment number of outer iterations */
1268	outeriter += nri;
1269
1270	/* Update outer/inner flops */
1271
1272	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409); /* macro expands to: (nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_VF] += outeriter*20 + inneriter*409 */
1273	}
1274	/*
1275	* Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_single
1276	* Electrostatics interaction: CubicSplineTable
1277	* VdW interaction: LennardJones
1278	* Geometry: Water3-Water3
1279	* Calculate force/pot: Force
1280	*/
1281	void
1282	nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_single
1283	(t_nblist * gmx_restrict nlist,
1284	rvec * gmx_restrict xx,
1285	rvec * gmx_restrict ff,
1286	t_forcerec * gmx_restrict fr,
1287	t_mdatoms * gmx_restrict mdatoms,
1288	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
1289	t_nrnb * gmx_restrict nrnb)
1290	{
1291	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1292	* just 0 for non-waters.
1293	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1294	* jnr indices corresponding to data put in the four positions in the SIMD register.
1295	*/
1296	int i_shift_offset,i_coord_offset,outeriter,inneriter;
1297	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1298	int jnrA,jnrB,jnrC,jnrD;
1299	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1300	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1301	int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1302	real rcutoff_scalar;
1303	real *shiftvec,*fshift,*x,*f;
1304	real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1305	real scratch[4*DIM3];
1306	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1307	int vdwioffset0;
1308	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1309	int vdwioffset1;
1310	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1311	int vdwioffset2;
1312	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1313	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1314	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1315	int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1316	__m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1317	int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1318	__m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1319	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1320	__m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1321	__m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1322	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1323	__m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1324	__m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1325	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1326	__m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1327	__m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1328	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
1329	real *charge;
1330	int nvdwtype;
1331	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1332	int *vdwtype;
1333	real *vdwparam;
1334	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
1335	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1336	__m128i vfitab;
1337	__m128i ifour = _mm_set1_epi32(4);
1338	__m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1339	real *vftab;
1340	__m128 dummy_mask,cutoff_mask;
1341	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1342	__m128 one = _mm_set1_ps(1.0);
1343	__m128 two = _mm_set1_ps(2.0);
1344	x = xx[0];
1345	f = ff[0];
1346
1347	nri = nlist->nri;
1348	iinr = nlist->iinr;
1349	jindex = nlist->jindex;
1350	jjnr = nlist->jjnr;
1351	shiftidx = nlist->shift;
1352	gid = nlist->gid;
1353	shiftvec = fr->shift_vec[0];
1354	fshift = fr->fshift[0];
1355	facel = _mm_set1_ps(fr->epsfac);
1356	charge = mdatoms->chargeA;
1357	nvdwtype = fr->ntype;
1358	vdwparam = fr->nbfp;
1359	vdwtype = mdatoms->typeA;
1360
1361	vftab = kernel_data->table_elec->data;
1362	vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
1363
1364	/* Setup water-specific parameters */
1365	inr = nlist->iinr[0];
1366	iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1367	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1368	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1369	vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1370
1371	jq0 = _mm_set1_ps(charge[inr+0]);
1372	jq1 = _mm_set1_ps(charge[inr+1]);
1373	jq2 = _mm_set1_ps(charge[inr+2]);
1374	vdwjidx0A = 2*vdwtype[inr+0];
1375	qq00 = _mm_mul_ps(iq0,jq0);
1376	c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1377	c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1378	qq01 = _mm_mul_ps(iq0,jq1);
1379	qq02 = _mm_mul_ps(iq0,jq2);
1380	qq10 = _mm_mul_ps(iq1,jq0);
1381	qq11 = _mm_mul_ps(iq1,jq1);
1382	qq12 = _mm_mul_ps(iq1,jq2);
1383	qq20 = _mm_mul_ps(iq2,jq0);
1384	qq21 = _mm_mul_ps(iq2,jq1);
1385	qq22 = _mm_mul_ps(iq2,jq2);
1386
1387	/* Avoid stupid compiler warnings */
1388	jnrA = jnrB = jnrC = jnrD = 0;
1389	j_coord_offsetA = 0;
1390	j_coord_offsetB = 0;
1391	j_coord_offsetC = 0;
1392	j_coord_offsetD = 0;
1393
1394	outeriter = 0;
1395	inneriter = 0;
1396
1397	for(iidx=0;iidx<4*DIM;iidx++)
1398	{
1399	scratch[iidx] = 0.0;
1400	}
1401
1402	/* Start outer loop over neighborlists */
1403	for(iidx=0; iidx<nri; iidx++)
1404	{
1405	/* Load shift vector for this list */
1406	i_shift_offset = DIM*shiftidx[iidx];
1407
1408	/* Load limits for loop over neighbors */
1409	j_index_start = jindex[iidx];
1410	j_index_end = jindex[iidx+1];
1411
1412	/* Get outer coordinate index */
1413	inr = iinr[iidx];
1414	i_coord_offset = DIM*inr;
1415
1416	/* Load i particle coords and add shift vector */
1417	gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1418	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1419
1420	fix0 = _mm_setzero_ps();
1421	fiy0 = _mm_setzero_ps();
1422	fiz0 = _mm_setzero_ps();
1423	fix1 = _mm_setzero_ps();
1424	fiy1 = _mm_setzero_ps();
1425	fiz1 = _mm_setzero_ps();
1426	fix2 = _mm_setzero_ps();
1427	fiy2 = _mm_setzero_ps();
1428	fiz2 = _mm_setzero_ps();
1429
1430	/* Start inner kernel loop */
1431	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1432	{
1433
1434	/* Get j neighbor index, and coordinate index */
1435	jnrA = jjnr[jidx];
1436	jnrB = jjnr[jidx+1];
1437	jnrC = jjnr[jidx+2];
1438	jnrD = jjnr[jidx+3];
1439	j_coord_offsetA = DIM*jnrA;
1440	j_coord_offsetB = DIM*jnrB;
1441	j_coord_offsetC = DIM*jnrC;
1442	j_coord_offsetD = DIM*jnrD;
1443
1444	/* load j atom coordinates */
1445	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1446	x+j_coord_offsetC,x+j_coord_offsetD,
1447	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1448
1449	/* Calculate displacement vector */
1450	dx00 = _mm_sub_ps(ix0,jx0);
1451	dy00 = _mm_sub_ps(iy0,jy0);
1452	dz00 = _mm_sub_ps(iz0,jz0);
1453	dx01 = _mm_sub_ps(ix0,jx1);
1454	dy01 = _mm_sub_ps(iy0,jy1);
1455	dz01 = _mm_sub_ps(iz0,jz1);
1456	dx02 = _mm_sub_ps(ix0,jx2);
1457	dy02 = _mm_sub_ps(iy0,jy2);
1458	dz02 = _mm_sub_ps(iz0,jz2);
1459	dx10 = _mm_sub_ps(ix1,jx0);
1460	dy10 = _mm_sub_ps(iy1,jy0);
1461	dz10 = _mm_sub_ps(iz1,jz0);
1462	dx11 = _mm_sub_ps(ix1,jx1);
1463	dy11 = _mm_sub_ps(iy1,jy1);
1464	dz11 = _mm_sub_ps(iz1,jz1);
1465	dx12 = _mm_sub_ps(ix1,jx2);
1466	dy12 = _mm_sub_ps(iy1,jy2);
1467	dz12 = _mm_sub_ps(iz1,jz2);
1468	dx20 = _mm_sub_ps(ix2,jx0);
1469	dy20 = _mm_sub_ps(iy2,jy0);
1470	dz20 = _mm_sub_ps(iz2,jz0);
1471	dx21 = _mm_sub_ps(ix2,jx1);
1472	dy21 = _mm_sub_ps(iy2,jy1);
1473	dz21 = _mm_sub_ps(iz2,jz1);
1474	dx22 = _mm_sub_ps(ix2,jx2);
1475	dy22 = _mm_sub_ps(iy2,jy2);
1476	dz22 = _mm_sub_ps(iz2,jz2);
1477
1478	/* Calculate squared distance and things based on it */
1479	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1480	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1481	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1482	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1483	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1484	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1485	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1486	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1487	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1488
1489	rinv00 = gmx_mm_invsqrt_ps(rsq00);
1490	rinv01 = gmx_mm_invsqrt_ps(rsq01);
1491	rinv02 = gmx_mm_invsqrt_ps(rsq02);
1492	rinv10 = gmx_mm_invsqrt_ps(rsq10);
1493	rinv11 = gmx_mm_invsqrt_ps(rsq11);
1494	rinv12 = gmx_mm_invsqrt_ps(rsq12);
1495	rinv20 = gmx_mm_invsqrt_ps(rsq20);
1496	rinv21 = gmx_mm_invsqrt_ps(rsq21);
1497	rinv22 = gmx_mm_invsqrt_ps(rsq22);
1498
1499	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1500
1501	fjx0 = _mm_setzero_ps();
1502	fjy0 = _mm_setzero_ps();
1503	fjz0 = _mm_setzero_ps();
1504	fjx1 = _mm_setzero_ps();
1505	fjy1 = _mm_setzero_ps();
1506	fjz1 = _mm_setzero_ps();
1507	fjx2 = _mm_setzero_ps();
1508	fjy2 = _mm_setzero_ps();
1509	fjz2 = _mm_setzero_ps();
1510
1511	/**************************
1512	* CALCULATE INTERACTIONS *
1513	**************************/
1514
1515	r00 = _mm_mul_ps(rsq00,rinv00);
1516
1517	/* Calculate table index by multiplying r with table scale and truncate to integer */
1518	rt = _mm_mul_ps(r00,vftabscale);
1519	vfitab = _mm_cvttps_epi32(rt);
1520	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1521	vfitab = _mm_slli_epi32(vfitab,2);
1522
1523	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1524	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1525	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1526	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1527	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1528	_MM_TRANSPOSE4_PS(Y,F,G,H);
1529	Heps = _mm_mul_ps(vfeps,H);
1530	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1531	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1532	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1533
1534	/* LENNARD-JONES DISPERSION/REPULSION */
1535
1536	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1537	fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1538
1539	fscal = _mm_add_ps(felec,fvdw);
1540
1541	/* Calculate temporary vectorial force */
1542	tx = _mm_mul_ps(fscal,dx00);
1543	ty = _mm_mul_ps(fscal,dy00);
1544	tz = _mm_mul_ps(fscal,dz00);
1545
1546	/* Update vectorial force */
1547	fix0 = _mm_add_ps(fix0,tx);
1548	fiy0 = _mm_add_ps(fiy0,ty);
1549	fiz0 = _mm_add_ps(fiz0,tz);
1550
1551	fjx0 = _mm_add_ps(fjx0,tx);
1552	fjy0 = _mm_add_ps(fjy0,ty);
1553	fjz0 = _mm_add_ps(fjz0,tz);
1554
1555	/**************************
1556	* CALCULATE INTERACTIONS *
1557	**************************/
1558
1559	r01 = _mm_mul_ps(rsq01,rinv01);
1560
1561	/* Calculate table index by multiplying r with table scale and truncate to integer */
1562	rt = _mm_mul_ps(r01,vftabscale);
1563	vfitab = _mm_cvttps_epi32(rt);
1564	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1565	vfitab = _mm_slli_epi32(vfitab,2);
1566
1567	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1568	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1569	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1570	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1571	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1572	_MM_TRANSPOSE4_PS(Y,F,G,H);
1573	Heps = _mm_mul_ps(vfeps,H);
1574	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1575	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1576	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1577
1578	fscal = felec;
1579
1580	/* Calculate temporary vectorial force */
1581	tx = _mm_mul_ps(fscal,dx01);
1582	ty = _mm_mul_ps(fscal,dy01);
1583	tz = _mm_mul_ps(fscal,dz01);
1584
1585	/* Update vectorial force */
1586	fix0 = _mm_add_ps(fix0,tx);
1587	fiy0 = _mm_add_ps(fiy0,ty);
1588	fiz0 = _mm_add_ps(fiz0,tz);
1589
1590	fjx1 = _mm_add_ps(fjx1,tx);
1591	fjy1 = _mm_add_ps(fjy1,ty);
1592	fjz1 = _mm_add_ps(fjz1,tz);
1593
1594	/**************************
1595	* CALCULATE INTERACTIONS *
1596	**************************/
1597
1598	r02 = _mm_mul_ps(rsq02,rinv02);
1599
1600	/* Calculate table index by multiplying r with table scale and truncate to integer */
1601	rt = _mm_mul_ps(r02,vftabscale);
1602	vfitab = _mm_cvttps_epi32(rt);
1603	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1604	vfitab = _mm_slli_epi32(vfitab,2);
1605
1606	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1607	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1608	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1609	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1610	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1611	_MM_TRANSPOSE4_PS(Y,F,G,H);
1612	Heps = _mm_mul_ps(vfeps,H);
1613	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1614	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1615	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1616
1617	fscal = felec;
1618
1619	/* Calculate temporary vectorial force */
1620	tx = _mm_mul_ps(fscal,dx02);
1621	ty = _mm_mul_ps(fscal,dy02);
1622	tz = _mm_mul_ps(fscal,dz02);
1623
1624	/* Update vectorial force */
1625	fix0 = _mm_add_ps(fix0,tx);
1626	fiy0 = _mm_add_ps(fiy0,ty);
1627	fiz0 = _mm_add_ps(fiz0,tz);
1628
1629	fjx2 = _mm_add_ps(fjx2,tx);
1630	fjy2 = _mm_add_ps(fjy2,ty);
1631	fjz2 = _mm_add_ps(fjz2,tz);
1632
1633	/**************************
1634	* CALCULATE INTERACTIONS *
1635	**************************/
1636
1637	r10 = _mm_mul_ps(rsq10,rinv10);
1638
1639	/* Calculate table index by multiplying r with table scale and truncate to integer */
1640	rt = _mm_mul_ps(r10,vftabscale);
1641	vfitab = _mm_cvttps_epi32(rt);
1642	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1643	vfitab = _mm_slli_epi32(vfitab,2);
1644
1645	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1646	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1647	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1648	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1649	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1650	_MM_TRANSPOSE4_PS(Y,F,G,H);
1651	Heps = _mm_mul_ps(vfeps,H);
1652	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1653	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1654	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1655
1656	fscal = felec;
1657
1658	/* Calculate temporary vectorial force */
1659	tx = _mm_mul_ps(fscal,dx10);
1660	ty = _mm_mul_ps(fscal,dy10);
1661	tz = _mm_mul_ps(fscal,dz10);
1662
1663	/* Update vectorial force */
1664	fix1 = _mm_add_ps(fix1,tx);
1665	fiy1 = _mm_add_ps(fiy1,ty);
1666	fiz1 = _mm_add_ps(fiz1,tz);
1667
1668	fjx0 = _mm_add_ps(fjx0,tx);
1669	fjy0 = _mm_add_ps(fjy0,ty);
1670	fjz0 = _mm_add_ps(fjz0,tz);
1671
1672	/**************************
1673	* CALCULATE INTERACTIONS *
1674	**************************/
1675
1676	r11 = _mm_mul_ps(rsq11,rinv11);
1677
1678	/* Calculate table index by multiplying r with table scale and truncate to integer */
1679	rt = _mm_mul_ps(r11,vftabscale);
1680	vfitab = _mm_cvttps_epi32(rt);
1681	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1682	vfitab = _mm_slli_epi32(vfitab,2);
1683
1684	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1685	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1686	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1687	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1688	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1689	_MM_TRANSPOSE4_PS(Y,F,G,H);
1690	Heps = _mm_mul_ps(vfeps,H);
1691	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1692	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1693	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1694
1695	fscal = felec;
1696
1697	/* Calculate temporary vectorial force */
1698	tx = _mm_mul_ps(fscal,dx11);
1699	ty = _mm_mul_ps(fscal,dy11);
1700	tz = _mm_mul_ps(fscal,dz11);
1701
1702	/* Update vectorial force */
1703	fix1 = _mm_add_ps(fix1,tx);
1704	fiy1 = _mm_add_ps(fiy1,ty);
1705	fiz1 = _mm_add_ps(fiz1,tz);
1706
1707	fjx1 = _mm_add_ps(fjx1,tx);
1708	fjy1 = _mm_add_ps(fjy1,ty);
1709	fjz1 = _mm_add_ps(fjz1,tz);
1710
1711	/**************************
1712	* CALCULATE INTERACTIONS *
1713	**************************/
1714
1715	r12 = _mm_mul_ps(rsq12,rinv12);
1716
1717	/* Calculate table index by multiplying r with table scale and truncate to integer */
1718	rt = _mm_mul_ps(r12,vftabscale);
1719	vfitab = _mm_cvttps_epi32(rt);
1720	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1721	vfitab = _mm_slli_epi32(vfitab,2);
1722
1723	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1724	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1725	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1726	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1727	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1728	_MM_TRANSPOSE4_PS(Y,F,G,H);
1729	Heps = _mm_mul_ps(vfeps,H);
1730	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1731	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1732	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1733
1734	fscal = felec;
1735
1736	/* Calculate temporary vectorial force */
1737	tx = _mm_mul_ps(fscal,dx12);
1738	ty = _mm_mul_ps(fscal,dy12);
1739	tz = _mm_mul_ps(fscal,dz12);
1740
1741	/* Update vectorial force */
1742	fix1 = _mm_add_ps(fix1,tx);
1743	fiy1 = _mm_add_ps(fiy1,ty);
1744	fiz1 = _mm_add_ps(fiz1,tz);
1745
1746	fjx2 = _mm_add_ps(fjx2,tx);
1747	fjy2 = _mm_add_ps(fjy2,ty);
1748	fjz2 = _mm_add_ps(fjz2,tz);
1749
1750	/**************************
1751	* CALCULATE INTERACTIONS *
1752	**************************/
1753
1754	r20 = _mm_mul_ps(rsq20,rinv20);
1755
1756	/* Calculate table index by multiplying r with table scale and truncate to integer */
1757	rt = _mm_mul_ps(r20,vftabscale);
1758	vfitab = _mm_cvttps_epi32(rt);
1759	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1760	vfitab = _mm_slli_epi32(vfitab,2);
1761
1762	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1763	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1764	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1765	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1766	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1767	_MM_TRANSPOSE4_PS(Y,F,G,H);
1768	Heps = _mm_mul_ps(vfeps,H);
1769	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1770	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1771	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1772
1773	fscal = felec;
1774
1775	/* Calculate temporary vectorial force */
1776	tx = _mm_mul_ps(fscal,dx20);
1777	ty = _mm_mul_ps(fscal,dy20);
1778	tz = _mm_mul_ps(fscal,dz20);
1779
1780	/* Update vectorial force */
1781	fix2 = _mm_add_ps(fix2,tx);
1782	fiy2 = _mm_add_ps(fiy2,ty);
1783	fiz2 = _mm_add_ps(fiz2,tz);
1784
1785	fjx0 = _mm_add_ps(fjx0,tx);
1786	fjy0 = _mm_add_ps(fjy0,ty);
1787	fjz0 = _mm_add_ps(fjz0,tz);
1788
1789	/**************************
1790	* CALCULATE INTERACTIONS *
1791	**************************/
1792
1793	r21 = _mm_mul_ps(rsq21,rinv21);
1794
1795	/* Calculate table index by multiplying r with table scale and truncate to integer */
1796	rt = _mm_mul_ps(r21,vftabscale);
1797	vfitab = _mm_cvttps_epi32(rt);
1798	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1799	vfitab = _mm_slli_epi32(vfitab,2);
1800
1801	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1802	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1803	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1804	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1805	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1806	_MM_TRANSPOSE4_PS(Y,F,G,H);
1807	Heps = _mm_mul_ps(vfeps,H);
1808	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1809	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1810	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1811
1812	fscal = felec;
1813
1814	/* Calculate temporary vectorial force */
1815	tx = _mm_mul_ps(fscal,dx21);
1816	ty = _mm_mul_ps(fscal,dy21);
1817	tz = _mm_mul_ps(fscal,dz21);
1818
1819	/* Update vectorial force */
1820	fix2 = _mm_add_ps(fix2,tx);
1821	fiy2 = _mm_add_ps(fiy2,ty);
1822	fiz2 = _mm_add_ps(fiz2,tz);
1823
1824	fjx1 = _mm_add_ps(fjx1,tx);
1825	fjy1 = _mm_add_ps(fjy1,ty);
1826	fjz1 = _mm_add_ps(fjz1,tz);
1827
1828	/**************************
1829	* CALCULATE INTERACTIONS *
1830	**************************/
1831
1832	r22 = _mm_mul_ps(rsq22,rinv22);
1833
1834	/* Calculate table index by multiplying r with table scale and truncate to integer */
1835	rt = _mm_mul_ps(r22,vftabscale);
1836	vfitab = _mm_cvttps_epi32(rt);
1837	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1838	vfitab = _mm_slli_epi32(vfitab,2);
1839
1840	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1841	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1842	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1843	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1844	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1845	_MM_TRANSPOSE4_PS(Y,F,G,H);
1846	Heps = _mm_mul_ps(vfeps,H);
1847	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1848	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1849	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1850
1851	fscal = felec;
1852
1853	/* Calculate temporary vectorial force */
1854	tx = _mm_mul_ps(fscal,dx22);
1855	ty = _mm_mul_ps(fscal,dy22);
1856	tz = _mm_mul_ps(fscal,dz22);
1857
1858	/* Update vectorial force */
1859	fix2 = _mm_add_ps(fix2,tx);
1860	fiy2 = _mm_add_ps(fiy2,ty);
1861	fiz2 = _mm_add_ps(fiz2,tz);
1862
1863	fjx2 = _mm_add_ps(fjx2,tx);
1864	fjy2 = _mm_add_ps(fjy2,ty);
1865	fjz2 = _mm_add_ps(fjz2,tz);
1866
1867	fjptrA = f+j_coord_offsetA;
1868	fjptrB = f+j_coord_offsetB;
1869	fjptrC = f+j_coord_offsetC;
1870	fjptrD = f+j_coord_offsetD;
1871
1872	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1873	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1874
1875	/* Inner loop uses 359 flops */
1876	}
1877
1878	if(jidx<j_index_end)
1879	{
1880
1881	/* Get j neighbor index, and coordinate index */
1882	jnrlistA = jjnr[jidx];
1883	jnrlistB = jjnr[jidx+1];
1884	jnrlistC = jjnr[jidx+2];
1885	jnrlistD = jjnr[jidx+3];
1886	/* Sign of each element will be negative for non-real atoms.
1887	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1888	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1889	*/
1890	dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1891	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1892	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1893	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1894	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1895	j_coord_offsetA = DIM*jnrA;
1896	j_coord_offsetB = DIM*jnrB;
1897	j_coord_offsetC = DIM*jnrC;
1898	j_coord_offsetD = DIM*jnrD;
1899
1900	/* load j atom coordinates */
1901	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1902	x+j_coord_offsetC,x+j_coord_offsetD,
1903	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1904
1905	/* Calculate displacement vector */
1906	dx00 = _mm_sub_ps(ix0,jx0);
1907	dy00 = _mm_sub_ps(iy0,jy0);
1908	dz00 = _mm_sub_ps(iz0,jz0);
1909	dx01 = _mm_sub_ps(ix0,jx1);
1910	dy01 = _mm_sub_ps(iy0,jy1);
1911	dz01 = _mm_sub_ps(iz0,jz1);
1912	dx02 = _mm_sub_ps(ix0,jx2);
1913	dy02 = _mm_sub_ps(iy0,jy2);
1914	dz02 = _mm_sub_ps(iz0,jz2);
1915	dx10 = _mm_sub_ps(ix1,jx0);
1916	dy10 = _mm_sub_ps(iy1,jy0);
1917	dz10 = _mm_sub_ps(iz1,jz0);
1918	dx11 = _mm_sub_ps(ix1,jx1);
1919	dy11 = _mm_sub_ps(iy1,jy1);
1920	dz11 = _mm_sub_ps(iz1,jz1);
1921	dx12 = _mm_sub_ps(ix1,jx2);
1922	dy12 = _mm_sub_ps(iy1,jy2);
1923	dz12 = _mm_sub_ps(iz1,jz2);
1924	dx20 = _mm_sub_ps(ix2,jx0);
1925	dy20 = _mm_sub_ps(iy2,jy0);
1926	dz20 = _mm_sub_ps(iz2,jz0);
1927	dx21 = _mm_sub_ps(ix2,jx1);
1928	dy21 = _mm_sub_ps(iy2,jy1);
1929	dz21 = _mm_sub_ps(iz2,jz1);
1930	dx22 = _mm_sub_ps(ix2,jx2);
1931	dy22 = _mm_sub_ps(iy2,jy2);
1932	dz22 = _mm_sub_ps(iz2,jz2);
1933
1934	/* Calculate squared distance and things based on it */
1935	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1936	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1937	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1938	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1939	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1940	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1941	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1942	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1943	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1944
1945	rinv00 = gmx_mm_invsqrt_ps(rsq00);
1946	rinv01 = gmx_mm_invsqrt_ps(rsq01);
1947	rinv02 = gmx_mm_invsqrt_ps(rsq02);
1948	rinv10 = gmx_mm_invsqrt_ps(rsq10);
1949	rinv11 = gmx_mm_invsqrt_ps(rsq11);
1950	rinv12 = gmx_mm_invsqrt_ps(rsq12);
1951	rinv20 = gmx_mm_invsqrt_ps(rsq20);
1952	rinv21 = gmx_mm_invsqrt_ps(rsq21);
1953	rinv22 = gmx_mm_invsqrt_ps(rsq22);
1954
1955	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1956
1957	fjx0 = _mm_setzero_ps();
1958	fjy0 = _mm_setzero_ps();
1959	fjz0 = _mm_setzero_ps();
1960	fjx1 = _mm_setzero_ps();
1961	fjy1 = _mm_setzero_ps();
1962	fjz1 = _mm_setzero_ps();
1963	fjx2 = _mm_setzero_ps();
1964	fjy2 = _mm_setzero_ps();
1965	fjz2 = _mm_setzero_ps();
1966
1967	/**************************
1968	* CALCULATE INTERACTIONS *
1969	**************************/
1970
1971	r00 = _mm_mul_ps(rsq00,rinv00);
1972	r00 = _mm_andnot_ps(dummy_mask,r00);
1973
1974	/* Calculate table index by multiplying r with table scale and truncate to integer */
1975	rt = _mm_mul_ps(r00,vftabscale);
1976	vfitab = _mm_cvttps_epi32(rt);
1977	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1978	vfitab = _mm_slli_epi32(vfitab,2);
1979
1980	/* CUBIC SPLINE TABLE ELECTROSTATICS */
1981	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1982	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1983	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1984	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1985	_MM_TRANSPOSE4_PS(Y,F,G,H);
1986	Heps = _mm_mul_ps(vfeps,H);
1987	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1988	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1989	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1990
1991	/* LENNARD-JONES DISPERSION/REPULSION */
1992
1993	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1994	fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1995
1996	fscal = _mm_add_ps(felec,fvdw);
1997
1998	fscal = _mm_andnot_ps(dummy_mask,fscal);
1999
2000	/* Calculate temporary vectorial force */
2001	tx = _mm_mul_ps(fscal,dx00);
2002	ty = _mm_mul_ps(fscal,dy00);
2003	tz = _mm_mul_ps(fscal,dz00);
2004
2005	/* Update vectorial force */
2006	fix0 = _mm_add_ps(fix0,tx);
2007	fiy0 = _mm_add_ps(fiy0,ty);
2008	fiz0 = _mm_add_ps(fiz0,tz);
2009
2010	fjx0 = _mm_add_ps(fjx0,tx);
2011	fjy0 = _mm_add_ps(fjy0,ty);
2012	fjz0 = _mm_add_ps(fjz0,tz);
2013
2014	/**************************
2015	* CALCULATE INTERACTIONS *
2016	**************************/
2017
2018	r01 = _mm_mul_ps(rsq01,rinv01);
2019	r01 = _mm_andnot_ps(dummy_mask,r01);
2020
2021	/* Calculate table index by multiplying r with table scale and truncate to integer */
2022	rt = _mm_mul_ps(r01,vftabscale);
2023	vfitab = _mm_cvttps_epi32(rt);
2024	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2025	vfitab = _mm_slli_epi32(vfitab,2);
2026
2027	/* CUBIC SPLINE TABLE ELECTROSTATICS */
2028	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2029	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2030	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2031	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2032	_MM_TRANSPOSE4_PS(Y,F,G,H);
2033	Heps = _mm_mul_ps(vfeps,H);
2034	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2035	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2036	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2037
2038	fscal = felec;
2039
2040	fscal = _mm_andnot_ps(dummy_mask,fscal);
2041
2042	/* Calculate temporary vectorial force */
2043	tx = _mm_mul_ps(fscal,dx01);
2044	ty = _mm_mul_ps(fscal,dy01);
2045	tz = _mm_mul_ps(fscal,dz01);
2046
2047	/* Update vectorial force */
2048	fix0 = _mm_add_ps(fix0,tx);
2049	fiy0 = _mm_add_ps(fiy0,ty);
2050	fiz0 = _mm_add_ps(fiz0,tz);
2051
2052	fjx1 = _mm_add_ps(fjx1,tx);
2053	fjy1 = _mm_add_ps(fjy1,ty);
2054	fjz1 = _mm_add_ps(fjz1,tz);
2055
2056	/**************************
2057	* CALCULATE INTERACTIONS *
2058	**************************/
2059
2060	r02 = _mm_mul_ps(rsq02,rinv02);
2061	r02 = _mm_andnot_ps(dummy_mask,r02);
2062
2063	/* Calculate table index by multiplying r with table scale and truncate to integer */
2064	rt = _mm_mul_ps(r02,vftabscale);
2065	vfitab = _mm_cvttps_epi32(rt);
2066	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }));
2067	vfitab = _mm_slli_epi32(vfitab,2);
2068
2069	/* CUBIC SPLINE TABLE ELECTROSTATICS */
2070	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
2071	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
2072	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
2073	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
2074	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
2075	Heps = _mm_mul_ps(vfeps,H);
2076	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2077	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2078	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2079
2080	fscal = felec;
2081
2082	fscal = _mm_andnot_ps(dummy_mask,fscal);
2083
2084	/* Calculate temporary vectorial force */
2085	tx = _mm_mul_ps(fscal,dx02);
2086	ty = _mm_mul_ps(fscal,dy02);
2087	tz = _mm_mul_ps(fscal,dz02);
2088
2089	/* Update vectorial force */
2090	fix0 = _mm_add_ps(fix0,tx);
2091	fiy0 = _mm_add_ps(fiy0,ty);
2092	fiz0 = _mm_add_ps(fiz0,tz);
2093
2094	fjx2 = _mm_add_ps(fjx2,tx);
2095	fjy2 = _mm_add_ps(fjy2,ty);
2096	fjz2 = _mm_add_ps(fjz2,tz);
2097
2098	/**************************
2099	* CALCULATE INTERACTIONS *
2100	**************************/
2101
2102	r10 = _mm_mul_ps(rsq10,rinv10);
2103	r10 = _mm_andnot_ps(dummy_mask,r10);
2104
2105	/* Calculate table index by multiplying r with table scale and truncate to integer */
2106	rt = _mm_mul_ps(r10,vftabscale);
2107	vfitab = _mm_cvttps_epi32(rt);
2108	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }));
2109	vfitab = _mm_slli_epi32(vfitab,2);
2110
2111	/* CUBIC SPLINE TABLE ELECTROSTATICS */
2112	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
2113	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
2114	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
2115	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
2116	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
2117	Heps = _mm_mul_ps(vfeps,H);
2118	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2119	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2120	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2121
2122	fscal = felec;
2123
2124	fscal = _mm_andnot_ps(dummy_mask,fscal);
2125
2126	/* Calculate temporary vectorial force */
2127	tx = _mm_mul_ps(fscal,dx10);
2128	ty = _mm_mul_ps(fscal,dy10);
2129	tz = _mm_mul_ps(fscal,dz10);
2130
2131	/* Update vectorial force */
2132	fix1 = _mm_add_ps(fix1,tx);
2133	fiy1 = _mm_add_ps(fiy1,ty);
2134	fiz1 = _mm_add_ps(fiz1,tz);
2135
2136	fjx0 = _mm_add_ps(fjx0,tx);
2137	fjy0 = _mm_add_ps(fjy0,ty);
2138	fjz0 = _mm_add_ps(fjz0,tz);
2139
2140	/**************************
2141	* CALCULATE INTERACTIONS *
2142	**************************/
2143
2144	r11 = _mm_mul_ps(rsq11,rinv11);
2145	r11 = _mm_andnot_ps(dummy_mask,r11);
2146
2147	/* Calculate table index by multiplying r with table scale and truncate to integer */
2148	rt = _mm_mul_ps(r11,vftabscale);
2149	vfitab = _mm_cvttps_epi32(rt);
2150	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }));
2151	vfitab = _mm_slli_epi32(vfitab,2);
2152
2153	/* CUBIC SPLINE TABLE ELECTROSTATICS */
2154	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
2155	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
2156	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
2157	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
2158	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
2159	Heps = _mm_mul_ps(vfeps,H);
2160	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2161	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2162	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2163
2164	fscal = felec;
2165
2166	fscal = _mm_andnot_ps(dummy_mask,fscal);
2167
2168	/* Calculate temporary vectorial force */
2169	tx = _mm_mul_ps(fscal,dx11);
2170	ty = _mm_mul_ps(fscal,dy11);
2171	tz = _mm_mul_ps(fscal,dz11);
2172
2173	/* Update vectorial force */
2174	fix1 = _mm_add_ps(fix1,tx);
2175	fiy1 = _mm_add_ps(fiy1,ty);
2176	fiz1 = _mm_add_ps(fiz1,tz);
2177
2178	fjx1 = _mm_add_ps(fjx1,tx);
2179	fjy1 = _mm_add_ps(fjy1,ty);
2180	fjz1 = _mm_add_ps(fjz1,tz);
2181
2182	/**************************
2183	* CALCULATE INTERACTIONS *
2184	**************************/
2185
2186	r12 = _mm_mul_ps(rsq12,rinv12);
2187	r12 = _mm_andnot_ps(dummy_mask,r12);
2188
2189	/* Calculate table index by multiplying r with table scale and truncate to integer */
2190	rt = _mm_mul_ps(r12,vftabscale);
2191	vfitab = _mm_cvttps_epi32(rt);
2192	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }));
2193	vfitab = _mm_slli_epi32(vfitab,2);
2194
2195	/* CUBIC SPLINE TABLE ELECTROSTATICS */
2196	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
2197	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
2198	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
2199	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
2200	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
2201	Heps = _mm_mul_ps(vfeps,H);
2202	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2203	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2204	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2205
2206	fscal = felec;
2207
2208	fscal = _mm_andnot_ps(dummy_mask,fscal);
2209
2210	/* Calculate temporary vectorial force */
2211	tx = _mm_mul_ps(fscal,dx12);
2212	ty = _mm_mul_ps(fscal,dy12);
2213	tz = _mm_mul_ps(fscal,dz12);
2214
2215	/* Update vectorial force */
2216	fix1 = _mm_add_ps(fix1,tx);
2217	fiy1 = _mm_add_ps(fiy1,ty);
2218	fiz1 = _mm_add_ps(fiz1,tz);
2219
2220	fjx2 = _mm_add_ps(fjx2,tx);
2221	fjy2 = _mm_add_ps(fjy2,ty);
2222	fjz2 = _mm_add_ps(fjz2,tz);
2223
2224	/**************************
2225	* CALCULATE INTERACTIONS *
2226	**************************/
2227
2228	r20 = _mm_mul_ps(rsq20,rinv20);
2229	r20 = _mm_andnot_ps(dummy_mask,r20);
2230
2231	/* Calculate table index by multiplying r with table scale and truncate to integer */
2232	rt = _mm_mul_ps(r20,vftabscale);
2233	vfitab = _mm_cvttps_epi32(rt);
2234	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }));
2235	vfitab = _mm_slli_epi32(vfitab,2);
2236
2237	/* CUBIC SPLINE TABLE ELECTROSTATICS */
2238	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
2239	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
2240	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
2241	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
2242	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
2243	Heps = _mm_mul_ps(vfeps,H);
2244	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2245	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2246	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2247
2248	fscal = felec;
2249
2250	fscal = _mm_andnot_ps(dummy_mask,fscal);
2251
2252	/* Calculate temporary vectorial force */
2253	tx = _mm_mul_ps(fscal,dx20);
2254	ty = _mm_mul_ps(fscal,dy20);
2255	tz = _mm_mul_ps(fscal,dz20);
2256
2257	/* Update vectorial force */
2258	fix2 = _mm_add_ps(fix2,tx);
2259	fiy2 = _mm_add_ps(fiy2,ty);
2260	fiz2 = _mm_add_ps(fiz2,tz);
2261
2262	fjx0 = _mm_add_ps(fjx0,tx);
2263	fjy0 = _mm_add_ps(fjy0,ty);
2264	fjz0 = _mm_add_ps(fjz0,tz);
2265
2266	/**************************
2267	* CALCULATE INTERACTIONS *
2268	**************************/
2269
2270	r21 = _mm_mul_ps(rsq21,rinv21);
2271	r21 = _mm_andnot_ps(dummy_mask,r21);
2272
2273	/* Calculate table index by multiplying r with table scale and truncate to integer */
2274	rt = _mm_mul_ps(r21,vftabscale);
2275	vfitab = _mm_cvttps_epi32(rt);
2276	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }));
2277	vfitab = _mm_slli_epi32(vfitab,2);
2278
2279	/* CUBIC SPLINE TABLE ELECTROSTATICS */
2280	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
2281	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
2282	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
2283	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
2284	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
2285	Heps = _mm_mul_ps(vfeps,H);
2286	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2287	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2288	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2289
2290	fscal = felec;
2291
2292	fscal = _mm_andnot_ps(dummy_mask,fscal);
2293
2294	/* Calculate temporary vectorial force */
2295	tx = _mm_mul_ps(fscal,dx21);
2296	ty = _mm_mul_ps(fscal,dy21);
2297	tz = _mm_mul_ps(fscal,dz21);
2298
2299	/* Update vectorial force */
2300	fix2 = _mm_add_ps(fix2,tx);
2301	fiy2 = _mm_add_ps(fiy2,ty);
2302	fiz2 = _mm_add_ps(fiz2,tz);
2303
2304	fjx1 = _mm_add_ps(fjx1,tx);
2305	fjy1 = _mm_add_ps(fjy1,ty);
2306	fjz1 = _mm_add_ps(fjz1,tz);
2307
2308	/**************************
2309	* CALCULATE INTERACTIONS *
2310	**************************/
2311
2312	r22 = _mm_mul_ps(rsq22,rinv22);
2313	r22 = _mm_andnot_ps(dummy_mask,r22);
2314
2315	/* Calculate table index by multiplying r with table scale and truncate to integer */
2316	rt = _mm_mul_ps(r22,vftabscale);
2317	vfitab = _mm_cvttps_epi32(rt);
2318	vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (rt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 | 0x01))); }));
2319	vfitab = _mm_slli_epi32(vfitab,2);
2320
2321	/* CUBIC SPLINE TABLE ELECTROSTATICS */
2322	Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(0) & 3];})) );
2323	F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(1) & 3];})) );
2324	G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(2) & 3];})) );
2325	H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3)(__extension__ ({ __v4si __a = (__v4si)(vfitab); __a[(3) & 3];})) );
2326	_MM_TRANSPOSE4_PS(Y,F,G,H)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((Y ), (F)); tmp2 = _mm_unpacklo_ps((G), (H)); tmp1 = _mm_unpackhi_ps ((Y), (F)); tmp3 = _mm_unpackhi_ps((G), (H)); (Y) = _mm_movelh_ps (tmp0, tmp2); (F) = _mm_movehl_ps(tmp2, tmp0); (G) = _mm_movelh_ps (tmp1, tmp3); (H) = _mm_movehl_ps(tmp3, tmp1); } while (0);
2327	Heps = _mm_mul_ps(vfeps,H);
2328	Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2329	FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2330	felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2331
2332	fscal = felec;
2333
2334	fscal = _mm_andnot_ps(dummy_mask,fscal);
2335
2336	/* Calculate temporary vectorial force */
2337	tx = _mm_mul_ps(fscal,dx22);
2338	ty = _mm_mul_ps(fscal,dy22);
2339	tz = _mm_mul_ps(fscal,dz22);
2340
2341	/* Update vectorial force */
2342	fix2 = _mm_add_ps(fix2,tx);
2343	fiy2 = _mm_add_ps(fiy2,ty);
2344	fiz2 = _mm_add_ps(fiz2,tz);
2345
2346	fjx2 = _mm_add_ps(fjx2,tx);
2347	fjy2 = _mm_add_ps(fjy2,ty);
2348	fjz2 = _mm_add_ps(fjz2,tz);
2349
2350	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2351	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2352	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2353	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2354
2355	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2356	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2357
2358	/* Inner loop uses 368 flops */
2359	}
2360
2361	/* End of innermost loop */
2362
2363	gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2364	f+i_coord_offset,fshift+i_shift_offset);
2365
2366	/* Increment number of inner iterations */
2367	inneriter += j_index_end - j_index_start;
2368
2369	/* Outer loop uses 18 flops */
2370	}
2371
2372	/* Increment number of outer iterations */
2373	outeriter += nri;
2374
2375	/* Update outer/inner flops */
2376
2377	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_F] += outeriter*18 + inneriter*368;
2378	}