/home/alexxy/Develop/gromacs/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse4_1

Bug Summary

File:	gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse4_1_single.c
Location:	line 188, column 5
Description:	Value stored to 'j_coord_offsetB' is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35	/*
36	* Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37	*/
38	#ifdef HAVE_CONFIG_H1
39	#include <config.h>
40	#endif
41
42	#include <math.h>
43
44	#include "../nb_kernel.h"
45	#include "types/simple.h"
46	#include "gromacs/math/vec.h"
47	#include "nrnb.h"
48
49	#include "gromacs/simd/math_x86_sse4_1_single.h"
50	#include "kernelutil_x86_sse4_1_single.h"
51
52	/*
53	* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_single
54	* Electrostatics interaction: Ewald
55	* VdW interaction: LennardJones
56	* Geometry: Water3-Water3
57	* Calculate force/pot: PotentialAndForce
58	*/
59	void
60	nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_single
61	(t_nblist * gmx_restrict nlist,
62	rvec * gmx_restrict xx,
63	rvec * gmx_restrict ff,
64	t_forcerec * gmx_restrict fr,
65	t_mdatoms * gmx_restrict mdatoms,
66	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67	t_nrnb * gmx_restrict nrnb)
68	{
69	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70	* just 0 for non-waters.
71	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72	* jnr indices corresponding to data put in the four positions in the SIMD register.
73	*/
74	int i_shift_offset,i_coord_offset,outeriter,inneriter;
75	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76	int jnrA,jnrB,jnrC,jnrD;
77	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79	int iinr,jindex,jjnr,shiftidx,*gid;
80	real rcutoff_scalar;
81	real shiftvec,fshift,x,f;
82	real fjptrA,fjptrB,fjptrC,fjptrD;
83	real scratch[4*DIM3];
84	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85	int vdwioffset0;
86	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87	int vdwioffset1;
88	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89	int vdwioffset2;
90	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93	int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94	__m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95	int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96	__m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98	__m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99	__m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101	__m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102	__m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104	__m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105	__m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
107	real *charge;
108	int nvdwtype;
109	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110	int *vdwtype;
111	real *vdwparam;
112	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
113	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
114	__m128i ewitab;
115	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
116	real *ewtab;
117	__m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
118	real rswitch_scalar,d_scalar;
119	__m128 dummy_mask,cutoff_mask;
120	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
121	__m128 one = _mm_set1_ps(1.0);
122	__m128 two = _mm_set1_ps(2.0);
123	x = xx[0];
124	f = ff[0];
125
126	nri = nlist->nri;
127	iinr = nlist->iinr;
128	jindex = nlist->jindex;
129	jjnr = nlist->jjnr;
130	shiftidx = nlist->shift;
131	gid = nlist->gid;
132	shiftvec = fr->shift_vec[0];
133	fshift = fr->fshift[0];
134	facel = _mm_set1_ps(fr->epsfac);
135	charge = mdatoms->chargeA;
136	nvdwtype = fr->ntype;
137	vdwparam = fr->nbfp;
138	vdwtype = mdatoms->typeA;
139
140	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
141	ewtab = fr->ic->tabq_coul_FDV0;
142	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
143	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
144
145	/* Setup water-specific parameters */
146	inr = nlist->iinr[0];
147	iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
148	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
149	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
150	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
151
152	jq0 = _mm_set1_ps(charge[inr+0]);
153	jq1 = _mm_set1_ps(charge[inr+1]);
154	jq2 = _mm_set1_ps(charge[inr+2]);
155	vdwjidx0A = 2*vdwtype[inr+0];
156	qq00 = _mm_mul_ps(iq0,jq0);
157	c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
158	c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
159	qq01 = _mm_mul_ps(iq0,jq1);
160	qq02 = _mm_mul_ps(iq0,jq2);
161	qq10 = _mm_mul_ps(iq1,jq0);
162	qq11 = _mm_mul_ps(iq1,jq1);
163	qq12 = _mm_mul_ps(iq1,jq2);
164	qq20 = _mm_mul_ps(iq2,jq0);
165	qq21 = _mm_mul_ps(iq2,jq1);
166	qq22 = _mm_mul_ps(iq2,jq2);
167
168	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169	rcutoff_scalar = fr->rcoulomb;
170	rcutoff = _mm_set1_ps(rcutoff_scalar);
171	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
172
173	rswitch_scalar = fr->rcoulomb_switch;
174	rswitch = _mm_set1_ps(rswitch_scalar);
175	/* Setup switch parameters */
176	d_scalar = rcutoff_scalar-rswitch_scalar;
177	d = _mm_set1_ps(d_scalar);
178	swV3 = _mm_set1_ps(-10.0/(d_scalard_scalard_scalar));
179	swV4 = _mm_set1_ps( 15.0/(d_scalard_scalard_scalar*d_scalar));
180	swV5 = _mm_set1_ps( -6.0/(d_scalard_scalard_scalard_scalard_scalar));
181	swF2 = _mm_set1_ps(-30.0/(d_scalard_scalard_scalar));
182	swF3 = _mm_set1_ps( 60.0/(d_scalard_scalard_scalar*d_scalar));
183	swF4 = _mm_set1_ps(-30.0/(d_scalard_scalard_scalard_scalard_scalar));
184
185	/* Avoid stupid compiler warnings */
186	jnrA = jnrB = jnrC = jnrD = 0;
187	j_coord_offsetA = 0;
188	j_coord_offsetB = 0;
	Value stored to 'j_coord_offsetB' is never read
189	j_coord_offsetC = 0;
190	j_coord_offsetD = 0;
191
192	outeriter = 0;
193	inneriter = 0;
194
195	for(iidx=0;iidx<4*DIM3;iidx++)
196	{
197	scratch[iidx] = 0.0;
198	}
199
200	/* Start outer loop over neighborlists */
201	for(iidx=0; iidx<nri; iidx++)
202	{
203	/* Load shift vector for this list */
204	i_shift_offset = DIM3*shiftidx[iidx];
205
206	/* Load limits for loop over neighbors */
207	j_index_start = jindex[iidx];
208	j_index_end = jindex[iidx+1];
209
210	/* Get outer coordinate index */
211	inr = iinr[iidx];
212	i_coord_offset = DIM3*inr;
213
214	/* Load i particle coords and add shift vector */
215	gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
216	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
217
218	fix0 = _mm_setzero_ps();
219	fiy0 = _mm_setzero_ps();
220	fiz0 = _mm_setzero_ps();
221	fix1 = _mm_setzero_ps();
222	fiy1 = _mm_setzero_ps();
223	fiz1 = _mm_setzero_ps();
224	fix2 = _mm_setzero_ps();
225	fiy2 = _mm_setzero_ps();
226	fiz2 = _mm_setzero_ps();
227
228	/* Reset potential sums */
229	velecsum = _mm_setzero_ps();
230	vvdwsum = _mm_setzero_ps();
231
232	/* Start inner kernel loop */
233	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
234	{
235
236	/* Get j neighbor index, and coordinate index */
237	jnrA = jjnr[jidx];
238	jnrB = jjnr[jidx+1];
239	jnrC = jjnr[jidx+2];
240	jnrD = jjnr[jidx+3];
241	j_coord_offsetA = DIM3*jnrA;
242	j_coord_offsetB = DIM3*jnrB;
243	j_coord_offsetC = DIM3*jnrC;
244	j_coord_offsetD = DIM3*jnrD;
245
246	/* load j atom coordinates */
247	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
248	x+j_coord_offsetC,x+j_coord_offsetD,
249	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
250
251	/* Calculate displacement vector */
252	dx00 = _mm_sub_ps(ix0,jx0);
253	dy00 = _mm_sub_ps(iy0,jy0);
254	dz00 = _mm_sub_ps(iz0,jz0);
255	dx01 = _mm_sub_ps(ix0,jx1);
256	dy01 = _mm_sub_ps(iy0,jy1);
257	dz01 = _mm_sub_ps(iz0,jz1);
258	dx02 = _mm_sub_ps(ix0,jx2);
259	dy02 = _mm_sub_ps(iy0,jy2);
260	dz02 = _mm_sub_ps(iz0,jz2);
261	dx10 = _mm_sub_ps(ix1,jx0);
262	dy10 = _mm_sub_ps(iy1,jy0);
263	dz10 = _mm_sub_ps(iz1,jz0);
264	dx11 = _mm_sub_ps(ix1,jx1);
265	dy11 = _mm_sub_ps(iy1,jy1);
266	dz11 = _mm_sub_ps(iz1,jz1);
267	dx12 = _mm_sub_ps(ix1,jx2);
268	dy12 = _mm_sub_ps(iy1,jy2);
269	dz12 = _mm_sub_ps(iz1,jz2);
270	dx20 = _mm_sub_ps(ix2,jx0);
271	dy20 = _mm_sub_ps(iy2,jy0);
272	dz20 = _mm_sub_ps(iz2,jz0);
273	dx21 = _mm_sub_ps(ix2,jx1);
274	dy21 = _mm_sub_ps(iy2,jy1);
275	dz21 = _mm_sub_ps(iz2,jz1);
276	dx22 = _mm_sub_ps(ix2,jx2);
277	dy22 = _mm_sub_ps(iy2,jy2);
278	dz22 = _mm_sub_ps(iz2,jz2);
279
280	/* Calculate squared distance and things based on it */
281	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
282	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
283	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
284	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
285	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
286	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
287	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
288	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
289	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
290
291	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
292	rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
293	rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
294	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
295	rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
296	rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
297	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
298	rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
299	rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
300
301	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
302	rinvsq01 = _mm_mul_ps(rinv01,rinv01);
303	rinvsq02 = _mm_mul_ps(rinv02,rinv02);
304	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
305	rinvsq11 = _mm_mul_ps(rinv11,rinv11);
306	rinvsq12 = _mm_mul_ps(rinv12,rinv12);
307	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
308	rinvsq21 = _mm_mul_ps(rinv21,rinv21);
309	rinvsq22 = _mm_mul_ps(rinv22,rinv22);
310
311	fjx0 = _mm_setzero_ps();
312	fjy0 = _mm_setzero_ps();
313	fjz0 = _mm_setzero_ps();
314	fjx1 = _mm_setzero_ps();
315	fjy1 = _mm_setzero_ps();
316	fjz1 = _mm_setzero_ps();
317	fjx2 = _mm_setzero_ps();
318	fjy2 = _mm_setzero_ps();
319	fjz2 = _mm_setzero_ps();
320
321	/**************************
322	* CALCULATE INTERACTIONS *
323	**************************/
324
325	if (gmx_mm_any_lt(rsq00,rcutoff2))
326	{
327
328	r00 = _mm_mul_ps(rsq00,rinv00);
329
330	/* EWALD ELECTROSTATICS */
331
332	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
333	ewrt = _mm_mul_ps(r00,ewtabscale);
334	ewitab = _mm_cvttps_epi32(ewrt);
335	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
336	ewitab = _mm_slli_epi32(ewitab,2);
337	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
338	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
339	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
340	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
341	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
342	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
343	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
344	velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
345	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
346
347	/* LENNARD-JONES DISPERSION/REPULSION */
348
349	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
350	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
351	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
352	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
353	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
354
355	d = _mm_sub_ps(r00,rswitch);
356	d = _mm_max_ps(d,_mm_setzero_ps());
357	d2 = _mm_mul_ps(d,d);
358	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
359
360	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
361
362	/* Evaluate switch function */
363	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
364	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
365	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
366	velec = _mm_mul_ps(velec,sw);
367	vvdw = _mm_mul_ps(vvdw,sw);
368	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
369
370	/* Update potential sum for this i atom from the interaction with this j atom. */
371	velec = _mm_and_ps(velec,cutoff_mask);
372	velecsum = _mm_add_ps(velecsum,velec);
373	vvdw = _mm_and_ps(vvdw,cutoff_mask);
374	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
375
376	fscal = _mm_add_ps(felec,fvdw);
377
378	fscal = _mm_and_ps(fscal,cutoff_mask);
379
380	/* Calculate temporary vectorial force */
381	tx = _mm_mul_ps(fscal,dx00);
382	ty = _mm_mul_ps(fscal,dy00);
383	tz = _mm_mul_ps(fscal,dz00);
384
385	/* Update vectorial force */
386	fix0 = _mm_add_ps(fix0,tx);
387	fiy0 = _mm_add_ps(fiy0,ty);
388	fiz0 = _mm_add_ps(fiz0,tz);
389
390	fjx0 = _mm_add_ps(fjx0,tx);
391	fjy0 = _mm_add_ps(fjy0,ty);
392	fjz0 = _mm_add_ps(fjz0,tz);
393
394	}
395
396	/**************************
397	* CALCULATE INTERACTIONS *
398	**************************/
399
400	if (gmx_mm_any_lt(rsq01,rcutoff2))
401	{
402
403	r01 = _mm_mul_ps(rsq01,rinv01);
404
405	/* EWALD ELECTROSTATICS */
406
407	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
408	ewrt = _mm_mul_ps(r01,ewtabscale);
409	ewitab = _mm_cvttps_epi32(ewrt);
410	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
411	ewitab = _mm_slli_epi32(ewitab,2);
412	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
413	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
414	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
415	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
416	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
417	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
418	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
419	velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
420	felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
421
422	d = _mm_sub_ps(r01,rswitch);
423	d = _mm_max_ps(d,_mm_setzero_ps());
424	d2 = _mm_mul_ps(d,d);
425	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
426
427	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
428
429	/* Evaluate switch function */
430	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
431	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
432	velec = _mm_mul_ps(velec,sw);
433	cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
434
435	/* Update potential sum for this i atom from the interaction with this j atom. */
436	velec = _mm_and_ps(velec,cutoff_mask);
437	velecsum = _mm_add_ps(velecsum,velec);
438
439	fscal = felec;
440
441	fscal = _mm_and_ps(fscal,cutoff_mask);
442
443	/* Calculate temporary vectorial force */
444	tx = _mm_mul_ps(fscal,dx01);
445	ty = _mm_mul_ps(fscal,dy01);
446	tz = _mm_mul_ps(fscal,dz01);
447
448	/* Update vectorial force */
449	fix0 = _mm_add_ps(fix0,tx);
450	fiy0 = _mm_add_ps(fiy0,ty);
451	fiz0 = _mm_add_ps(fiz0,tz);
452
453	fjx1 = _mm_add_ps(fjx1,tx);
454	fjy1 = _mm_add_ps(fjy1,ty);
455	fjz1 = _mm_add_ps(fjz1,tz);
456
457	}
458
459	/**************************
460	* CALCULATE INTERACTIONS *
461	**************************/
462
463	if (gmx_mm_any_lt(rsq02,rcutoff2))
464	{
465
466	r02 = _mm_mul_ps(rsq02,rinv02);
467
468	/* EWALD ELECTROSTATICS */
469
470	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
471	ewrt = _mm_mul_ps(r02,ewtabscale);
472	ewitab = _mm_cvttps_epi32(ewrt);
473	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
474	ewitab = _mm_slli_epi32(ewitab,2);
475	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
476	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
477	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
478	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
479	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
480	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
481	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
482	velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
483	felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
484
485	d = _mm_sub_ps(r02,rswitch);
486	d = _mm_max_ps(d,_mm_setzero_ps());
487	d2 = _mm_mul_ps(d,d);
488	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
489
490	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
491
492	/* Evaluate switch function */
493	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
494	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
495	velec = _mm_mul_ps(velec,sw);
496	cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
497
498	/* Update potential sum for this i atom from the interaction with this j atom. */
499	velec = _mm_and_ps(velec,cutoff_mask);
500	velecsum = _mm_add_ps(velecsum,velec);
501
502	fscal = felec;
503
504	fscal = _mm_and_ps(fscal,cutoff_mask);
505
506	/* Calculate temporary vectorial force */
507	tx = _mm_mul_ps(fscal,dx02);
508	ty = _mm_mul_ps(fscal,dy02);
509	tz = _mm_mul_ps(fscal,dz02);
510
511	/* Update vectorial force */
512	fix0 = _mm_add_ps(fix0,tx);
513	fiy0 = _mm_add_ps(fiy0,ty);
514	fiz0 = _mm_add_ps(fiz0,tz);
515
516	fjx2 = _mm_add_ps(fjx2,tx);
517	fjy2 = _mm_add_ps(fjy2,ty);
518	fjz2 = _mm_add_ps(fjz2,tz);
519
520	}
521
522	/**************************
523	* CALCULATE INTERACTIONS *
524	**************************/
525
526	if (gmx_mm_any_lt(rsq10,rcutoff2))
527	{
528
529	r10 = _mm_mul_ps(rsq10,rinv10);
530
531	/* EWALD ELECTROSTATICS */
532
533	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
534	ewrt = _mm_mul_ps(r10,ewtabscale);
535	ewitab = _mm_cvttps_epi32(ewrt);
536	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
537	ewitab = _mm_slli_epi32(ewitab,2);
538	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
539	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
540	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
541	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
542	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
543	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
544	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
545	velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
546	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
547
548	d = _mm_sub_ps(r10,rswitch);
549	d = _mm_max_ps(d,_mm_setzero_ps());
550	d2 = _mm_mul_ps(d,d);
551	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
552
553	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
554
555	/* Evaluate switch function */
556	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
557	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
558	velec = _mm_mul_ps(velec,sw);
559	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
560
561	/* Update potential sum for this i atom from the interaction with this j atom. */
562	velec = _mm_and_ps(velec,cutoff_mask);
563	velecsum = _mm_add_ps(velecsum,velec);
564
565	fscal = felec;
566
567	fscal = _mm_and_ps(fscal,cutoff_mask);
568
569	/* Calculate temporary vectorial force */
570	tx = _mm_mul_ps(fscal,dx10);
571	ty = _mm_mul_ps(fscal,dy10);
572	tz = _mm_mul_ps(fscal,dz10);
573
574	/* Update vectorial force */
575	fix1 = _mm_add_ps(fix1,tx);
576	fiy1 = _mm_add_ps(fiy1,ty);
577	fiz1 = _mm_add_ps(fiz1,tz);
578
579	fjx0 = _mm_add_ps(fjx0,tx);
580	fjy0 = _mm_add_ps(fjy0,ty);
581	fjz0 = _mm_add_ps(fjz0,tz);
582
583	}
584
585	/**************************
586	* CALCULATE INTERACTIONS *
587	**************************/
588
589	if (gmx_mm_any_lt(rsq11,rcutoff2))
590	{
591
592	r11 = _mm_mul_ps(rsq11,rinv11);
593
594	/* EWALD ELECTROSTATICS */
595
596	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
597	ewrt = _mm_mul_ps(r11,ewtabscale);
598	ewitab = _mm_cvttps_epi32(ewrt);
599	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
600	ewitab = _mm_slli_epi32(ewitab,2);
601	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
602	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
603	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
604	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
605	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
606	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
607	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
608	velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
609	felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
610
611	d = _mm_sub_ps(r11,rswitch);
612	d = _mm_max_ps(d,_mm_setzero_ps());
613	d2 = _mm_mul_ps(d,d);
614	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
615
616	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
617
618	/* Evaluate switch function */
619	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
620	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
621	velec = _mm_mul_ps(velec,sw);
622	cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
623
624	/* Update potential sum for this i atom from the interaction with this j atom. */
625	velec = _mm_and_ps(velec,cutoff_mask);
626	velecsum = _mm_add_ps(velecsum,velec);
627
628	fscal = felec;
629
630	fscal = _mm_and_ps(fscal,cutoff_mask);
631
632	/* Calculate temporary vectorial force */
633	tx = _mm_mul_ps(fscal,dx11);
634	ty = _mm_mul_ps(fscal,dy11);
635	tz = _mm_mul_ps(fscal,dz11);
636
637	/* Update vectorial force */
638	fix1 = _mm_add_ps(fix1,tx);
639	fiy1 = _mm_add_ps(fiy1,ty);
640	fiz1 = _mm_add_ps(fiz1,tz);
641
642	fjx1 = _mm_add_ps(fjx1,tx);
643	fjy1 = _mm_add_ps(fjy1,ty);
644	fjz1 = _mm_add_ps(fjz1,tz);
645
646	}
647
648	/**************************
649	* CALCULATE INTERACTIONS *
650	**************************/
651
652	if (gmx_mm_any_lt(rsq12,rcutoff2))
653	{
654
655	r12 = _mm_mul_ps(rsq12,rinv12);
656
657	/* EWALD ELECTROSTATICS */
658
659	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
660	ewrt = _mm_mul_ps(r12,ewtabscale);
661	ewitab = _mm_cvttps_epi32(ewrt);
662	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
663	ewitab = _mm_slli_epi32(ewitab,2);
664	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
665	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
666	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
667	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
668	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
669	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
670	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
671	velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
672	felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
673
674	d = _mm_sub_ps(r12,rswitch);
675	d = _mm_max_ps(d,_mm_setzero_ps());
676	d2 = _mm_mul_ps(d,d);
677	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
678
679	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
680
681	/* Evaluate switch function */
682	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
683	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
684	velec = _mm_mul_ps(velec,sw);
685	cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
686
687	/* Update potential sum for this i atom from the interaction with this j atom. */
688	velec = _mm_and_ps(velec,cutoff_mask);
689	velecsum = _mm_add_ps(velecsum,velec);
690
691	fscal = felec;
692
693	fscal = _mm_and_ps(fscal,cutoff_mask);
694
695	/* Calculate temporary vectorial force */
696	tx = _mm_mul_ps(fscal,dx12);
697	ty = _mm_mul_ps(fscal,dy12);
698	tz = _mm_mul_ps(fscal,dz12);
699
700	/* Update vectorial force */
701	fix1 = _mm_add_ps(fix1,tx);
702	fiy1 = _mm_add_ps(fiy1,ty);
703	fiz1 = _mm_add_ps(fiz1,tz);
704
705	fjx2 = _mm_add_ps(fjx2,tx);
706	fjy2 = _mm_add_ps(fjy2,ty);
707	fjz2 = _mm_add_ps(fjz2,tz);
708
709	}
710
711	/**************************
712	* CALCULATE INTERACTIONS *
713	**************************/
714
715	if (gmx_mm_any_lt(rsq20,rcutoff2))
716	{
717
718	r20 = _mm_mul_ps(rsq20,rinv20);
719
720	/* EWALD ELECTROSTATICS */
721
722	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
723	ewrt = _mm_mul_ps(r20,ewtabscale);
724	ewitab = _mm_cvttps_epi32(ewrt);
725	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
726	ewitab = _mm_slli_epi32(ewitab,2);
727	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
728	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
729	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
730	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
731	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
732	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
733	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
734	velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
735	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
736
737	d = _mm_sub_ps(r20,rswitch);
738	d = _mm_max_ps(d,_mm_setzero_ps());
739	d2 = _mm_mul_ps(d,d);
740	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
741
742	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
743
744	/* Evaluate switch function */
745	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
746	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
747	velec = _mm_mul_ps(velec,sw);
748	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
749
750	/* Update potential sum for this i atom from the interaction with this j atom. */
751	velec = _mm_and_ps(velec,cutoff_mask);
752	velecsum = _mm_add_ps(velecsum,velec);
753
754	fscal = felec;
755
756	fscal = _mm_and_ps(fscal,cutoff_mask);
757
758	/* Calculate temporary vectorial force */
759	tx = _mm_mul_ps(fscal,dx20);
760	ty = _mm_mul_ps(fscal,dy20);
761	tz = _mm_mul_ps(fscal,dz20);
762
763	/* Update vectorial force */
764	fix2 = _mm_add_ps(fix2,tx);
765	fiy2 = _mm_add_ps(fiy2,ty);
766	fiz2 = _mm_add_ps(fiz2,tz);
767
768	fjx0 = _mm_add_ps(fjx0,tx);
769	fjy0 = _mm_add_ps(fjy0,ty);
770	fjz0 = _mm_add_ps(fjz0,tz);
771
772	}
773
774	/**************************
775	* CALCULATE INTERACTIONS *
776	**************************/
777
778	if (gmx_mm_any_lt(rsq21,rcutoff2))
779	{
780
781	r21 = _mm_mul_ps(rsq21,rinv21);
782
783	/* EWALD ELECTROSTATICS */
784
785	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
786	ewrt = _mm_mul_ps(r21,ewtabscale);
787	ewitab = _mm_cvttps_epi32(ewrt);
788	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
789	ewitab = _mm_slli_epi32(ewitab,2);
790	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
791	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
792	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
793	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
794	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
795	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
796	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
797	velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
798	felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
799
800	d = _mm_sub_ps(r21,rswitch);
801	d = _mm_max_ps(d,_mm_setzero_ps());
802	d2 = _mm_mul_ps(d,d);
803	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
804
805	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
806
807	/* Evaluate switch function */
808	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
809	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
810	velec = _mm_mul_ps(velec,sw);
811	cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
812
813	/* Update potential sum for this i atom from the interaction with this j atom. */
814	velec = _mm_and_ps(velec,cutoff_mask);
815	velecsum = _mm_add_ps(velecsum,velec);
816
817	fscal = felec;
818
819	fscal = _mm_and_ps(fscal,cutoff_mask);
820
821	/* Calculate temporary vectorial force */
822	tx = _mm_mul_ps(fscal,dx21);
823	ty = _mm_mul_ps(fscal,dy21);
824	tz = _mm_mul_ps(fscal,dz21);
825
826	/* Update vectorial force */
827	fix2 = _mm_add_ps(fix2,tx);
828	fiy2 = _mm_add_ps(fiy2,ty);
829	fiz2 = _mm_add_ps(fiz2,tz);
830
831	fjx1 = _mm_add_ps(fjx1,tx);
832	fjy1 = _mm_add_ps(fjy1,ty);
833	fjz1 = _mm_add_ps(fjz1,tz);
834
835	}
836
837	/**************************
838	* CALCULATE INTERACTIONS *
839	**************************/
840
841	if (gmx_mm_any_lt(rsq22,rcutoff2))
842	{
843
844	r22 = _mm_mul_ps(rsq22,rinv22);
845
846	/* EWALD ELECTROSTATICS */
847
848	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
849	ewrt = _mm_mul_ps(r22,ewtabscale);
850	ewitab = _mm_cvttps_epi32(ewrt);
851	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
852	ewitab = _mm_slli_epi32(ewitab,2);
853	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
854	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
855	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
856	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
857	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
858	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
859	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
860	velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
861	felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
862
863	d = _mm_sub_ps(r22,rswitch);
864	d = _mm_max_ps(d,_mm_setzero_ps());
865	d2 = _mm_mul_ps(d,d);
866	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
867
868	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
869
870	/* Evaluate switch function */
871	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
872	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
873	velec = _mm_mul_ps(velec,sw);
874	cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
875
876	/* Update potential sum for this i atom from the interaction with this j atom. */
877	velec = _mm_and_ps(velec,cutoff_mask);
878	velecsum = _mm_add_ps(velecsum,velec);
879
880	fscal = felec;
881
882	fscal = _mm_and_ps(fscal,cutoff_mask);
883
884	/* Calculate temporary vectorial force */
885	tx = _mm_mul_ps(fscal,dx22);
886	ty = _mm_mul_ps(fscal,dy22);
887	tz = _mm_mul_ps(fscal,dz22);
888
889	/* Update vectorial force */
890	fix2 = _mm_add_ps(fix2,tx);
891	fiy2 = _mm_add_ps(fiy2,ty);
892	fiz2 = _mm_add_ps(fiz2,tz);
893
894	fjx2 = _mm_add_ps(fjx2,tx);
895	fjy2 = _mm_add_ps(fjy2,ty);
896	fjz2 = _mm_add_ps(fjz2,tz);
897
898	}
899
900	fjptrA = f+j_coord_offsetA;
901	fjptrB = f+j_coord_offsetB;
902	fjptrC = f+j_coord_offsetC;
903	fjptrD = f+j_coord_offsetD;
904
905	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
906	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
907
908	/* Inner loop uses 603 flops */
909	}
910
911	if(jidx<j_index_end)
912	{
913
914	/* Get j neighbor index, and coordinate index */
915	jnrlistA = jjnr[jidx];
916	jnrlistB = jjnr[jidx+1];
917	jnrlistC = jjnr[jidx+2];
918	jnrlistD = jjnr[jidx+3];
919	/* Sign of each element will be negative for non-real atoms.
920	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
921	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
922	*/
923	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
924	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
925	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
926	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
927	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
928	j_coord_offsetA = DIM3*jnrA;
929	j_coord_offsetB = DIM3*jnrB;
930	j_coord_offsetC = DIM3*jnrC;
931	j_coord_offsetD = DIM3*jnrD;
932
933	/* load j atom coordinates */
934	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
935	x+j_coord_offsetC,x+j_coord_offsetD,
936	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
937
938	/* Calculate displacement vector */
939	dx00 = _mm_sub_ps(ix0,jx0);
940	dy00 = _mm_sub_ps(iy0,jy0);
941	dz00 = _mm_sub_ps(iz0,jz0);
942	dx01 = _mm_sub_ps(ix0,jx1);
943	dy01 = _mm_sub_ps(iy0,jy1);
944	dz01 = _mm_sub_ps(iz0,jz1);
945	dx02 = _mm_sub_ps(ix0,jx2);
946	dy02 = _mm_sub_ps(iy0,jy2);
947	dz02 = _mm_sub_ps(iz0,jz2);
948	dx10 = _mm_sub_ps(ix1,jx0);
949	dy10 = _mm_sub_ps(iy1,jy0);
950	dz10 = _mm_sub_ps(iz1,jz0);
951	dx11 = _mm_sub_ps(ix1,jx1);
952	dy11 = _mm_sub_ps(iy1,jy1);
953	dz11 = _mm_sub_ps(iz1,jz1);
954	dx12 = _mm_sub_ps(ix1,jx2);
955	dy12 = _mm_sub_ps(iy1,jy2);
956	dz12 = _mm_sub_ps(iz1,jz2);
957	dx20 = _mm_sub_ps(ix2,jx0);
958	dy20 = _mm_sub_ps(iy2,jy0);
959	dz20 = _mm_sub_ps(iz2,jz0);
960	dx21 = _mm_sub_ps(ix2,jx1);
961	dy21 = _mm_sub_ps(iy2,jy1);
962	dz21 = _mm_sub_ps(iz2,jz1);
963	dx22 = _mm_sub_ps(ix2,jx2);
964	dy22 = _mm_sub_ps(iy2,jy2);
965	dz22 = _mm_sub_ps(iz2,jz2);
966
967	/* Calculate squared distance and things based on it */
968	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
969	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
970	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
971	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
972	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
973	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
974	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
975	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
976	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
977
978	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
979	rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
980	rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
981	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
982	rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
983	rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
984	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
985	rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
986	rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
987
988	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
989	rinvsq01 = _mm_mul_ps(rinv01,rinv01);
990	rinvsq02 = _mm_mul_ps(rinv02,rinv02);
991	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
992	rinvsq11 = _mm_mul_ps(rinv11,rinv11);
993	rinvsq12 = _mm_mul_ps(rinv12,rinv12);
994	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
995	rinvsq21 = _mm_mul_ps(rinv21,rinv21);
996	rinvsq22 = _mm_mul_ps(rinv22,rinv22);
997
998	fjx0 = _mm_setzero_ps();
999	fjy0 = _mm_setzero_ps();
1000	fjz0 = _mm_setzero_ps();
1001	fjx1 = _mm_setzero_ps();
1002	fjy1 = _mm_setzero_ps();
1003	fjz1 = _mm_setzero_ps();
1004	fjx2 = _mm_setzero_ps();
1005	fjy2 = _mm_setzero_ps();
1006	fjz2 = _mm_setzero_ps();
1007
1008	/**************************
1009	* CALCULATE INTERACTIONS *
1010	**************************/
1011
1012	if (gmx_mm_any_lt(rsq00,rcutoff2))
1013	{
1014
1015	r00 = _mm_mul_ps(rsq00,rinv00);
1016	r00 = _mm_andnot_ps(dummy_mask,r00);
1017
1018	/* EWALD ELECTROSTATICS */
1019
1020	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1021	ewrt = _mm_mul_ps(r00,ewtabscale);
1022	ewitab = _mm_cvttps_epi32(ewrt);
1023	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1024	ewitab = _mm_slli_epi32(ewitab,2);
1025	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1026	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1027	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1028	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1029	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1030	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1031	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1032	velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1033	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1034
1035	/* LENNARD-JONES DISPERSION/REPULSION */
1036
1037	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1038	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1039	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1040	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1041	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1042
1043	d = _mm_sub_ps(r00,rswitch);
1044	d = _mm_max_ps(d,_mm_setzero_ps());
1045	d2 = _mm_mul_ps(d,d);
1046	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1047
1048	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1049
1050	/* Evaluate switch function */
1051	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1052	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1053	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1054	velec = _mm_mul_ps(velec,sw);
1055	vvdw = _mm_mul_ps(vvdw,sw);
1056	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1057
1058	/* Update potential sum for this i atom from the interaction with this j atom. */
1059	velec = _mm_and_ps(velec,cutoff_mask);
1060	velec = _mm_andnot_ps(dummy_mask,velec);
1061	velecsum = _mm_add_ps(velecsum,velec);
1062	vvdw = _mm_and_ps(vvdw,cutoff_mask);
1063	vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1064	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1065
1066	fscal = _mm_add_ps(felec,fvdw);
1067
1068	fscal = _mm_and_ps(fscal,cutoff_mask);
1069
1070	fscal = _mm_andnot_ps(dummy_mask,fscal);
1071
1072	/* Calculate temporary vectorial force */
1073	tx = _mm_mul_ps(fscal,dx00);
1074	ty = _mm_mul_ps(fscal,dy00);
1075	tz = _mm_mul_ps(fscal,dz00);
1076
1077	/* Update vectorial force */
1078	fix0 = _mm_add_ps(fix0,tx);
1079	fiy0 = _mm_add_ps(fiy0,ty);
1080	fiz0 = _mm_add_ps(fiz0,tz);
1081
1082	fjx0 = _mm_add_ps(fjx0,tx);
1083	fjy0 = _mm_add_ps(fjy0,ty);
1084	fjz0 = _mm_add_ps(fjz0,tz);
1085
1086	}
1087
1088	/**************************
1089	* CALCULATE INTERACTIONS *
1090	**************************/
1091
1092	if (gmx_mm_any_lt(rsq01,rcutoff2))
1093	{
1094
1095	r01 = _mm_mul_ps(rsq01,rinv01);
1096	r01 = _mm_andnot_ps(dummy_mask,r01);
1097
1098	/* EWALD ELECTROSTATICS */
1099
1100	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1101	ewrt = _mm_mul_ps(r01,ewtabscale);
1102	ewitab = _mm_cvttps_epi32(ewrt);
1103	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1104	ewitab = _mm_slli_epi32(ewitab,2);
1105	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1106	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1107	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1108	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1109	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1110	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1111	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1112	velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1113	felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1114
1115	d = _mm_sub_ps(r01,rswitch);
1116	d = _mm_max_ps(d,_mm_setzero_ps());
1117	d2 = _mm_mul_ps(d,d);
1118	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1119
1120	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1121
1122	/* Evaluate switch function */
1123	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1124	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1125	velec = _mm_mul_ps(velec,sw);
1126	cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1127
1128	/* Update potential sum for this i atom from the interaction with this j atom. */
1129	velec = _mm_and_ps(velec,cutoff_mask);
1130	velec = _mm_andnot_ps(dummy_mask,velec);
1131	velecsum = _mm_add_ps(velecsum,velec);
1132
1133	fscal = felec;
1134
1135	fscal = _mm_and_ps(fscal,cutoff_mask);
1136
1137	fscal = _mm_andnot_ps(dummy_mask,fscal);
1138
1139	/* Calculate temporary vectorial force */
1140	tx = _mm_mul_ps(fscal,dx01);
1141	ty = _mm_mul_ps(fscal,dy01);
1142	tz = _mm_mul_ps(fscal,dz01);
1143
1144	/* Update vectorial force */
1145	fix0 = _mm_add_ps(fix0,tx);
1146	fiy0 = _mm_add_ps(fiy0,ty);
1147	fiz0 = _mm_add_ps(fiz0,tz);
1148
1149	fjx1 = _mm_add_ps(fjx1,tx);
1150	fjy1 = _mm_add_ps(fjy1,ty);
1151	fjz1 = _mm_add_ps(fjz1,tz);
1152
1153	}
1154
1155	/**************************
1156	* CALCULATE INTERACTIONS *
1157	**************************/
1158
1159	if (gmx_mm_any_lt(rsq02,rcutoff2))
1160	{
1161
1162	r02 = _mm_mul_ps(rsq02,rinv02);
1163	r02 = _mm_andnot_ps(dummy_mask,r02);
1164
1165	/* EWALD ELECTROSTATICS */
1166
1167	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1168	ewrt = _mm_mul_ps(r02,ewtabscale);
1169	ewitab = _mm_cvttps_epi32(ewrt);
1170	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1171	ewitab = _mm_slli_epi32(ewitab,2);
1172	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1173	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1174	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1175	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1176	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1177	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1178	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1179	velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1180	felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1181
1182	d = _mm_sub_ps(r02,rswitch);
1183	d = _mm_max_ps(d,_mm_setzero_ps());
1184	d2 = _mm_mul_ps(d,d);
1185	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1186
1187	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1188
1189	/* Evaluate switch function */
1190	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1191	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1192	velec = _mm_mul_ps(velec,sw);
1193	cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1194
1195	/* Update potential sum for this i atom from the interaction with this j atom. */
1196	velec = _mm_and_ps(velec,cutoff_mask);
1197	velec = _mm_andnot_ps(dummy_mask,velec);
1198	velecsum = _mm_add_ps(velecsum,velec);
1199
1200	fscal = felec;
1201
1202	fscal = _mm_and_ps(fscal,cutoff_mask);
1203
1204	fscal = _mm_andnot_ps(dummy_mask,fscal);
1205
1206	/* Calculate temporary vectorial force */
1207	tx = _mm_mul_ps(fscal,dx02);
1208	ty = _mm_mul_ps(fscal,dy02);
1209	tz = _mm_mul_ps(fscal,dz02);
1210
1211	/* Update vectorial force */
1212	fix0 = _mm_add_ps(fix0,tx);
1213	fiy0 = _mm_add_ps(fiy0,ty);
1214	fiz0 = _mm_add_ps(fiz0,tz);
1215
1216	fjx2 = _mm_add_ps(fjx2,tx);
1217	fjy2 = _mm_add_ps(fjy2,ty);
1218	fjz2 = _mm_add_ps(fjz2,tz);
1219
1220	}
1221
1222	/**************************
1223	* CALCULATE INTERACTIONS *
1224	**************************/
1225
1226	if (gmx_mm_any_lt(rsq10,rcutoff2))
1227	{
1228
1229	r10 = _mm_mul_ps(rsq10,rinv10);
1230	r10 = _mm_andnot_ps(dummy_mask,r10);
1231
1232	/* EWALD ELECTROSTATICS */
1233
1234	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1235	ewrt = _mm_mul_ps(r10,ewtabscale);
1236	ewitab = _mm_cvttps_epi32(ewrt);
1237	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1238	ewitab = _mm_slli_epi32(ewitab,2);
1239	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1240	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1241	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1242	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1243	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1244	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1245	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1246	velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1247	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1248
1249	d = _mm_sub_ps(r10,rswitch);
1250	d = _mm_max_ps(d,_mm_setzero_ps());
1251	d2 = _mm_mul_ps(d,d);
1252	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1253
1254	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1255
1256	/* Evaluate switch function */
1257	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1258	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1259	velec = _mm_mul_ps(velec,sw);
1260	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1261
1262	/* Update potential sum for this i atom from the interaction with this j atom. */
1263	velec = _mm_and_ps(velec,cutoff_mask);
1264	velec = _mm_andnot_ps(dummy_mask,velec);
1265	velecsum = _mm_add_ps(velecsum,velec);
1266
1267	fscal = felec;
1268
1269	fscal = _mm_and_ps(fscal,cutoff_mask);
1270
1271	fscal = _mm_andnot_ps(dummy_mask,fscal);
1272
1273	/* Calculate temporary vectorial force */
1274	tx = _mm_mul_ps(fscal,dx10);
1275	ty = _mm_mul_ps(fscal,dy10);
1276	tz = _mm_mul_ps(fscal,dz10);
1277
1278	/* Update vectorial force */
1279	fix1 = _mm_add_ps(fix1,tx);
1280	fiy1 = _mm_add_ps(fiy1,ty);
1281	fiz1 = _mm_add_ps(fiz1,tz);
1282
1283	fjx0 = _mm_add_ps(fjx0,tx);
1284	fjy0 = _mm_add_ps(fjy0,ty);
1285	fjz0 = _mm_add_ps(fjz0,tz);
1286
1287	}
1288
1289	/**************************
1290	* CALCULATE INTERACTIONS *
1291	**************************/
1292
1293	if (gmx_mm_any_lt(rsq11,rcutoff2))
1294	{
1295
1296	r11 = _mm_mul_ps(rsq11,rinv11);
1297	r11 = _mm_andnot_ps(dummy_mask,r11);
1298
1299	/* EWALD ELECTROSTATICS */
1300
1301	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1302	ewrt = _mm_mul_ps(r11,ewtabscale);
1303	ewitab = _mm_cvttps_epi32(ewrt);
1304	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1305	ewitab = _mm_slli_epi32(ewitab,2);
1306	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1307	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1308	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1309	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1310	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1311	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1312	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1313	velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1314	felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1315
1316	d = _mm_sub_ps(r11,rswitch);
1317	d = _mm_max_ps(d,_mm_setzero_ps());
1318	d2 = _mm_mul_ps(d,d);
1319	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1320
1321	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1322
1323	/* Evaluate switch function */
1324	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1325	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1326	velec = _mm_mul_ps(velec,sw);
1327	cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1328
1329	/* Update potential sum for this i atom from the interaction with this j atom. */
1330	velec = _mm_and_ps(velec,cutoff_mask);
1331	velec = _mm_andnot_ps(dummy_mask,velec);
1332	velecsum = _mm_add_ps(velecsum,velec);
1333
1334	fscal = felec;
1335
1336	fscal = _mm_and_ps(fscal,cutoff_mask);
1337
1338	fscal = _mm_andnot_ps(dummy_mask,fscal);
1339
1340	/* Calculate temporary vectorial force */
1341	tx = _mm_mul_ps(fscal,dx11);
1342	ty = _mm_mul_ps(fscal,dy11);
1343	tz = _mm_mul_ps(fscal,dz11);
1344
1345	/* Update vectorial force */
1346	fix1 = _mm_add_ps(fix1,tx);
1347	fiy1 = _mm_add_ps(fiy1,ty);
1348	fiz1 = _mm_add_ps(fiz1,tz);
1349
1350	fjx1 = _mm_add_ps(fjx1,tx);
1351	fjy1 = _mm_add_ps(fjy1,ty);
1352	fjz1 = _mm_add_ps(fjz1,tz);
1353
1354	}
1355
1356	/**************************
1357	* CALCULATE INTERACTIONS *
1358	**************************/
1359
1360	if (gmx_mm_any_lt(rsq12,rcutoff2))
1361	{
1362
1363	r12 = _mm_mul_ps(rsq12,rinv12);
1364	r12 = _mm_andnot_ps(dummy_mask,r12);
1365
1366	/* EWALD ELECTROSTATICS */
1367
1368	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1369	ewrt = _mm_mul_ps(r12,ewtabscale);
1370	ewitab = _mm_cvttps_epi32(ewrt);
1371	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1372	ewitab = _mm_slli_epi32(ewitab,2);
1373	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1374	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1375	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1376	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1377	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1378	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1379	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1380	velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1381	felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1382
1383	d = _mm_sub_ps(r12,rswitch);
1384	d = _mm_max_ps(d,_mm_setzero_ps());
1385	d2 = _mm_mul_ps(d,d);
1386	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1387
1388	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1389
1390	/* Evaluate switch function */
1391	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1392	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1393	velec = _mm_mul_ps(velec,sw);
1394	cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1395
1396	/* Update potential sum for this i atom from the interaction with this j atom. */
1397	velec = _mm_and_ps(velec,cutoff_mask);
1398	velec = _mm_andnot_ps(dummy_mask,velec);
1399	velecsum = _mm_add_ps(velecsum,velec);
1400
1401	fscal = felec;
1402
1403	fscal = _mm_and_ps(fscal,cutoff_mask);
1404
1405	fscal = _mm_andnot_ps(dummy_mask,fscal);
1406
1407	/* Calculate temporary vectorial force */
1408	tx = _mm_mul_ps(fscal,dx12);
1409	ty = _mm_mul_ps(fscal,dy12);
1410	tz = _mm_mul_ps(fscal,dz12);
1411
1412	/* Update vectorial force */
1413	fix1 = _mm_add_ps(fix1,tx);
1414	fiy1 = _mm_add_ps(fiy1,ty);
1415	fiz1 = _mm_add_ps(fiz1,tz);
1416
1417	fjx2 = _mm_add_ps(fjx2,tx);
1418	fjy2 = _mm_add_ps(fjy2,ty);
1419	fjz2 = _mm_add_ps(fjz2,tz);
1420
1421	}
1422
1423	/**************************
1424	* CALCULATE INTERACTIONS *
1425	**************************/
1426
1427	if (gmx_mm_any_lt(rsq20,rcutoff2))
1428	{
1429
1430	r20 = _mm_mul_ps(rsq20,rinv20);
1431	r20 = _mm_andnot_ps(dummy_mask,r20);
1432
1433	/* EWALD ELECTROSTATICS */
1434
1435	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1436	ewrt = _mm_mul_ps(r20,ewtabscale);
1437	ewitab = _mm_cvttps_epi32(ewrt);
1438	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1439	ewitab = _mm_slli_epi32(ewitab,2);
1440	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1441	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1442	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1443	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1444	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1445	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1446	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1447	velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1448	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1449
1450	d = _mm_sub_ps(r20,rswitch);
1451	d = _mm_max_ps(d,_mm_setzero_ps());
1452	d2 = _mm_mul_ps(d,d);
1453	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1454
1455	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1456
1457	/* Evaluate switch function */
1458	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1459	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1460	velec = _mm_mul_ps(velec,sw);
1461	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1462
1463	/* Update potential sum for this i atom from the interaction with this j atom. */
1464	velec = _mm_and_ps(velec,cutoff_mask);
1465	velec = _mm_andnot_ps(dummy_mask,velec);
1466	velecsum = _mm_add_ps(velecsum,velec);
1467
1468	fscal = felec;
1469
1470	fscal = _mm_and_ps(fscal,cutoff_mask);
1471
1472	fscal = _mm_andnot_ps(dummy_mask,fscal);
1473
1474	/* Calculate temporary vectorial force */
1475	tx = _mm_mul_ps(fscal,dx20);
1476	ty = _mm_mul_ps(fscal,dy20);
1477	tz = _mm_mul_ps(fscal,dz20);
1478
1479	/* Update vectorial force */
1480	fix2 = _mm_add_ps(fix2,tx);
1481	fiy2 = _mm_add_ps(fiy2,ty);
1482	fiz2 = _mm_add_ps(fiz2,tz);
1483
1484	fjx0 = _mm_add_ps(fjx0,tx);
1485	fjy0 = _mm_add_ps(fjy0,ty);
1486	fjz0 = _mm_add_ps(fjz0,tz);
1487
1488	}
1489
1490	/**************************
1491	* CALCULATE INTERACTIONS *
1492	**************************/
1493
1494	if (gmx_mm_any_lt(rsq21,rcutoff2))
1495	{
1496
1497	r21 = _mm_mul_ps(rsq21,rinv21);
1498	r21 = _mm_andnot_ps(dummy_mask,r21);
1499
1500	/* EWALD ELECTROSTATICS */
1501
1502	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1503	ewrt = _mm_mul_ps(r21,ewtabscale);
1504	ewitab = _mm_cvttps_epi32(ewrt);
1505	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1506	ewitab = _mm_slli_epi32(ewitab,2);
1507	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1508	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1509	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1510	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1511	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1512	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1513	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1514	velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1515	felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1516
1517	d = _mm_sub_ps(r21,rswitch);
1518	d = _mm_max_ps(d,_mm_setzero_ps());
1519	d2 = _mm_mul_ps(d,d);
1520	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1521
1522	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1523
1524	/* Evaluate switch function */
1525	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1526	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1527	velec = _mm_mul_ps(velec,sw);
1528	cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1529
1530	/* Update potential sum for this i atom from the interaction with this j atom. */
1531	velec = _mm_and_ps(velec,cutoff_mask);
1532	velec = _mm_andnot_ps(dummy_mask,velec);
1533	velecsum = _mm_add_ps(velecsum,velec);
1534
1535	fscal = felec;
1536
1537	fscal = _mm_and_ps(fscal,cutoff_mask);
1538
1539	fscal = _mm_andnot_ps(dummy_mask,fscal);
1540
1541	/* Calculate temporary vectorial force */
1542	tx = _mm_mul_ps(fscal,dx21);
1543	ty = _mm_mul_ps(fscal,dy21);
1544	tz = _mm_mul_ps(fscal,dz21);
1545
1546	/* Update vectorial force */
1547	fix2 = _mm_add_ps(fix2,tx);
1548	fiy2 = _mm_add_ps(fiy2,ty);
1549	fiz2 = _mm_add_ps(fiz2,tz);
1550
1551	fjx1 = _mm_add_ps(fjx1,tx);
1552	fjy1 = _mm_add_ps(fjy1,ty);
1553	fjz1 = _mm_add_ps(fjz1,tz);
1554
1555	}
1556
1557	/**************************
1558	* CALCULATE INTERACTIONS *
1559	**************************/
1560
1561	if (gmx_mm_any_lt(rsq22,rcutoff2))
1562	{
1563
1564	r22 = _mm_mul_ps(rsq22,rinv22);
1565	r22 = _mm_andnot_ps(dummy_mask,r22);
1566
1567	/* EWALD ELECTROSTATICS */
1568
1569	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1570	ewrt = _mm_mul_ps(r22,ewtabscale);
1571	ewitab = _mm_cvttps_epi32(ewrt);
1572	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1573	ewitab = _mm_slli_epi32(ewitab,2);
1574	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1575	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1576	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1577	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1578	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1579	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1580	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1581	velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1582	felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1583
1584	d = _mm_sub_ps(r22,rswitch);
1585	d = _mm_max_ps(d,_mm_setzero_ps());
1586	d2 = _mm_mul_ps(d,d);
1587	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1588
1589	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1590
1591	/* Evaluate switch function */
1592	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1593	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1594	velec = _mm_mul_ps(velec,sw);
1595	cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1596
1597	/* Update potential sum for this i atom from the interaction with this j atom. */
1598	velec = _mm_and_ps(velec,cutoff_mask);
1599	velec = _mm_andnot_ps(dummy_mask,velec);
1600	velecsum = _mm_add_ps(velecsum,velec);
1601
1602	fscal = felec;
1603
1604	fscal = _mm_and_ps(fscal,cutoff_mask);
1605
1606	fscal = _mm_andnot_ps(dummy_mask,fscal);
1607
1608	/* Calculate temporary vectorial force */
1609	tx = _mm_mul_ps(fscal,dx22);
1610	ty = _mm_mul_ps(fscal,dy22);
1611	tz = _mm_mul_ps(fscal,dz22);
1612
1613	/* Update vectorial force */
1614	fix2 = _mm_add_ps(fix2,tx);
1615	fiy2 = _mm_add_ps(fiy2,ty);
1616	fiz2 = _mm_add_ps(fiz2,tz);
1617
1618	fjx2 = _mm_add_ps(fjx2,tx);
1619	fjy2 = _mm_add_ps(fjy2,ty);
1620	fjz2 = _mm_add_ps(fjz2,tz);
1621
1622	}
1623
1624	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1625	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1626	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1627	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1628
1629	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1630	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1631
1632	/* Inner loop uses 612 flops */
1633	}
1634
1635	/* End of innermost loop */
1636
1637	gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1638	f+i_coord_offset,fshift+i_shift_offset);
1639
1640	ggid = gid[iidx];
1641	/* Update potential energies */
1642	gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1643	gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1644
1645	/* Increment number of inner iterations */
1646	inneriter += j_index_end - j_index_start;
1647
1648	/* Outer loop uses 20 flops */
1649	}
1650
1651	/* Increment number of outer iterations */
1652	outeriter += nri;
1653
1654	/* Update outer/inner flops */
1655
1656	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter20 + inneriter612)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_VF] += outeriter20 + inneriter612;
1657	}
1658	/*
1659	* Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_single
1660	* Electrostatics interaction: Ewald
1661	* VdW interaction: LennardJones
1662	* Geometry: Water3-Water3
1663	* Calculate force/pot: Force
1664	*/
1665	void
1666	nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_single
1667	(t_nblist * gmx_restrict nlist,
1668	rvec * gmx_restrict xx,
1669	rvec * gmx_restrict ff,
1670	t_forcerec * gmx_restrict fr,
1671	t_mdatoms * gmx_restrict mdatoms,
1672	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
1673	t_nrnb * gmx_restrict nrnb)
1674	{
1675	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1676	* just 0 for non-waters.
1677	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1678	* jnr indices corresponding to data put in the four positions in the SIMD register.
1679	*/
1680	int i_shift_offset,i_coord_offset,outeriter,inneriter;
1681	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1682	int jnrA,jnrB,jnrC,jnrD;
1683	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1684	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1685	int iinr,jindex,jjnr,shiftidx,*gid;
1686	real rcutoff_scalar;
1687	real shiftvec,fshift,x,f;
1688	real fjptrA,fjptrB,fjptrC,fjptrD;
1689	real scratch[4*DIM3];
1690	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1691	int vdwioffset0;
1692	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1693	int vdwioffset1;
1694	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1695	int vdwioffset2;
1696	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1697	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1698	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1699	int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1700	__m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1701	int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1702	__m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1703	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1704	__m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1705	__m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1706	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1707	__m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1708	__m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1709	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1710	__m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1711	__m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1712	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
1713	real *charge;
1714	int nvdwtype;
1715	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1716	int *vdwtype;
1717	real *vdwparam;
1718	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
1719	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1720	__m128i ewitab;
1721	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1722	real *ewtab;
1723	__m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1724	real rswitch_scalar,d_scalar;
1725	__m128 dummy_mask,cutoff_mask;
1726	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1727	__m128 one = _mm_set1_ps(1.0);
1728	__m128 two = _mm_set1_ps(2.0);
1729	x = xx[0];
1730	f = ff[0];
1731
1732	nri = nlist->nri;
1733	iinr = nlist->iinr;
1734	jindex = nlist->jindex;
1735	jjnr = nlist->jjnr;
1736	shiftidx = nlist->shift;
1737	gid = nlist->gid;
1738	shiftvec = fr->shift_vec[0];
1739	fshift = fr->fshift[0];
1740	facel = _mm_set1_ps(fr->epsfac);
1741	charge = mdatoms->chargeA;
1742	nvdwtype = fr->ntype;
1743	vdwparam = fr->nbfp;
1744	vdwtype = mdatoms->typeA;
1745
1746	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1747	ewtab = fr->ic->tabq_coul_FDV0;
1748	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1749	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1750
1751	/* Setup water-specific parameters */
1752	inr = nlist->iinr[0];
1753	iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1754	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1755	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1756	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
1757
1758	jq0 = _mm_set1_ps(charge[inr+0]);
1759	jq1 = _mm_set1_ps(charge[inr+1]);
1760	jq2 = _mm_set1_ps(charge[inr+2]);
1761	vdwjidx0A = 2*vdwtype[inr+0];
1762	qq00 = _mm_mul_ps(iq0,jq0);
1763	c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1764	c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1765	qq01 = _mm_mul_ps(iq0,jq1);
1766	qq02 = _mm_mul_ps(iq0,jq2);
1767	qq10 = _mm_mul_ps(iq1,jq0);
1768	qq11 = _mm_mul_ps(iq1,jq1);
1769	qq12 = _mm_mul_ps(iq1,jq2);
1770	qq20 = _mm_mul_ps(iq2,jq0);
1771	qq21 = _mm_mul_ps(iq2,jq1);
1772	qq22 = _mm_mul_ps(iq2,jq2);
1773
1774	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1775	rcutoff_scalar = fr->rcoulomb;
1776	rcutoff = _mm_set1_ps(rcutoff_scalar);
1777	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1778
1779	rswitch_scalar = fr->rcoulomb_switch;
1780	rswitch = _mm_set1_ps(rswitch_scalar);
1781	/* Setup switch parameters */
1782	d_scalar = rcutoff_scalar-rswitch_scalar;
1783	d = _mm_set1_ps(d_scalar);
1784	swV3 = _mm_set1_ps(-10.0/(d_scalard_scalard_scalar));
1785	swV4 = _mm_set1_ps( 15.0/(d_scalard_scalard_scalar*d_scalar));
1786	swV5 = _mm_set1_ps( -6.0/(d_scalard_scalard_scalard_scalard_scalar));
1787	swF2 = _mm_set1_ps(-30.0/(d_scalard_scalard_scalar));
1788	swF3 = _mm_set1_ps( 60.0/(d_scalard_scalard_scalar*d_scalar));
1789	swF4 = _mm_set1_ps(-30.0/(d_scalard_scalard_scalard_scalard_scalar));
1790
1791	/* Avoid stupid compiler warnings */
1792	jnrA = jnrB = jnrC = jnrD = 0;
1793	j_coord_offsetA = 0;
1794	j_coord_offsetB = 0;
1795	j_coord_offsetC = 0;
1796	j_coord_offsetD = 0;
1797
1798	outeriter = 0;
1799	inneriter = 0;
1800
1801	for(iidx=0;iidx<4*DIM3;iidx++)
1802	{
1803	scratch[iidx] = 0.0;
1804	}
1805
1806	/* Start outer loop over neighborlists */
1807	for(iidx=0; iidx<nri; iidx++)
1808	{
1809	/* Load shift vector for this list */
1810	i_shift_offset = DIM3*shiftidx[iidx];
1811
1812	/* Load limits for loop over neighbors */
1813	j_index_start = jindex[iidx];
1814	j_index_end = jindex[iidx+1];
1815
1816	/* Get outer coordinate index */
1817	inr = iinr[iidx];
1818	i_coord_offset = DIM3*inr;
1819
1820	/* Load i particle coords and add shift vector */
1821	gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1822	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1823
1824	fix0 = _mm_setzero_ps();
1825	fiy0 = _mm_setzero_ps();
1826	fiz0 = _mm_setzero_ps();
1827	fix1 = _mm_setzero_ps();
1828	fiy1 = _mm_setzero_ps();
1829	fiz1 = _mm_setzero_ps();
1830	fix2 = _mm_setzero_ps();
1831	fiy2 = _mm_setzero_ps();
1832	fiz2 = _mm_setzero_ps();
1833
1834	/* Start inner kernel loop */
1835	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1836	{
1837
1838	/* Get j neighbor index, and coordinate index */
1839	jnrA = jjnr[jidx];
1840	jnrB = jjnr[jidx+1];
1841	jnrC = jjnr[jidx+2];
1842	jnrD = jjnr[jidx+3];
1843	j_coord_offsetA = DIM3*jnrA;
1844	j_coord_offsetB = DIM3*jnrB;
1845	j_coord_offsetC = DIM3*jnrC;
1846	j_coord_offsetD = DIM3*jnrD;
1847
1848	/* load j atom coordinates */
1849	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1850	x+j_coord_offsetC,x+j_coord_offsetD,
1851	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1852
1853	/* Calculate displacement vector */
1854	dx00 = _mm_sub_ps(ix0,jx0);
1855	dy00 = _mm_sub_ps(iy0,jy0);
1856	dz00 = _mm_sub_ps(iz0,jz0);
1857	dx01 = _mm_sub_ps(ix0,jx1);
1858	dy01 = _mm_sub_ps(iy0,jy1);
1859	dz01 = _mm_sub_ps(iz0,jz1);
1860	dx02 = _mm_sub_ps(ix0,jx2);
1861	dy02 = _mm_sub_ps(iy0,jy2);
1862	dz02 = _mm_sub_ps(iz0,jz2);
1863	dx10 = _mm_sub_ps(ix1,jx0);
1864	dy10 = _mm_sub_ps(iy1,jy0);
1865	dz10 = _mm_sub_ps(iz1,jz0);
1866	dx11 = _mm_sub_ps(ix1,jx1);
1867	dy11 = _mm_sub_ps(iy1,jy1);
1868	dz11 = _mm_sub_ps(iz1,jz1);
1869	dx12 = _mm_sub_ps(ix1,jx2);
1870	dy12 = _mm_sub_ps(iy1,jy2);
1871	dz12 = _mm_sub_ps(iz1,jz2);
1872	dx20 = _mm_sub_ps(ix2,jx0);
1873	dy20 = _mm_sub_ps(iy2,jy0);
1874	dz20 = _mm_sub_ps(iz2,jz0);
1875	dx21 = _mm_sub_ps(ix2,jx1);
1876	dy21 = _mm_sub_ps(iy2,jy1);
1877	dz21 = _mm_sub_ps(iz2,jz1);
1878	dx22 = _mm_sub_ps(ix2,jx2);
1879	dy22 = _mm_sub_ps(iy2,jy2);
1880	dz22 = _mm_sub_ps(iz2,jz2);
1881
1882	/* Calculate squared distance and things based on it */
1883	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1884	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1885	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1886	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1887	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1888	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1889	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1890	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1891	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1892
1893	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1894	rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
1895	rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
1896	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1897	rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1898	rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1899	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1900	rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1901	rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1902
1903	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1904	rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1905	rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1906	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1907	rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1908	rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1909	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1910	rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1911	rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1912
1913	fjx0 = _mm_setzero_ps();
1914	fjy0 = _mm_setzero_ps();
1915	fjz0 = _mm_setzero_ps();
1916	fjx1 = _mm_setzero_ps();
1917	fjy1 = _mm_setzero_ps();
1918	fjz1 = _mm_setzero_ps();
1919	fjx2 = _mm_setzero_ps();
1920	fjy2 = _mm_setzero_ps();
1921	fjz2 = _mm_setzero_ps();
1922
1923	/**************************
1924	* CALCULATE INTERACTIONS *
1925	**************************/
1926
1927	if (gmx_mm_any_lt(rsq00,rcutoff2))
1928	{
1929
1930	r00 = _mm_mul_ps(rsq00,rinv00);
1931
1932	/* EWALD ELECTROSTATICS */
1933
1934	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1935	ewrt = _mm_mul_ps(r00,ewtabscale);
1936	ewitab = _mm_cvttps_epi32(ewrt);
1937	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1938	ewitab = _mm_slli_epi32(ewitab,2);
1939	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1940	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1941	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1942	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1943	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1944	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1945	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1946	velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1947	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1948
1949	/* LENNARD-JONES DISPERSION/REPULSION */
1950
1951	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1952	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1953	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1954	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1955	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1956
1957	d = _mm_sub_ps(r00,rswitch);
1958	d = _mm_max_ps(d,_mm_setzero_ps());
1959	d2 = _mm_mul_ps(d,d);
1960	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1961
1962	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1963
1964	/* Evaluate switch function */
1965	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
1966	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1967	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1968	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1969
1970	fscal = _mm_add_ps(felec,fvdw);
1971
1972	fscal = _mm_and_ps(fscal,cutoff_mask);
1973
1974	/* Calculate temporary vectorial force */
1975	tx = _mm_mul_ps(fscal,dx00);
1976	ty = _mm_mul_ps(fscal,dy00);
1977	tz = _mm_mul_ps(fscal,dz00);
1978
1979	/* Update vectorial force */
1980	fix0 = _mm_add_ps(fix0,tx);
1981	fiy0 = _mm_add_ps(fiy0,ty);
1982	fiz0 = _mm_add_ps(fiz0,tz);
1983
1984	fjx0 = _mm_add_ps(fjx0,tx);
1985	fjy0 = _mm_add_ps(fjy0,ty);
1986	fjz0 = _mm_add_ps(fjz0,tz);
1987
1988	}
1989
1990	/**************************
1991	* CALCULATE INTERACTIONS *
1992	**************************/
1993
1994	if (gmx_mm_any_lt(rsq01,rcutoff2))
1995	{
1996
1997	r01 = _mm_mul_ps(rsq01,rinv01);
1998
1999	/* EWALD ELECTROSTATICS */
2000
2001	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2002	ewrt = _mm_mul_ps(r01,ewtabscale);
2003	ewitab = _mm_cvttps_epi32(ewrt);
2004	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2005	ewitab = _mm_slli_epi32(ewitab,2);
2006	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2007	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2008	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2009	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2010	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2011	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2012	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2013	velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2014	felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2015
2016	d = _mm_sub_ps(r01,rswitch);
2017	d = _mm_max_ps(d,_mm_setzero_ps());
2018	d2 = _mm_mul_ps(d,d);
2019	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2020
2021	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2022
2023	/* Evaluate switch function */
2024	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2025	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2026	cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2027
2028	fscal = felec;
2029
2030	fscal = _mm_and_ps(fscal,cutoff_mask);
2031
2032	/* Calculate temporary vectorial force */
2033	tx = _mm_mul_ps(fscal,dx01);
2034	ty = _mm_mul_ps(fscal,dy01);
2035	tz = _mm_mul_ps(fscal,dz01);
2036
2037	/* Update vectorial force */
2038	fix0 = _mm_add_ps(fix0,tx);
2039	fiy0 = _mm_add_ps(fiy0,ty);
2040	fiz0 = _mm_add_ps(fiz0,tz);
2041
2042	fjx1 = _mm_add_ps(fjx1,tx);
2043	fjy1 = _mm_add_ps(fjy1,ty);
2044	fjz1 = _mm_add_ps(fjz1,tz);
2045
2046	}
2047
2048	/**************************
2049	* CALCULATE INTERACTIONS *
2050	**************************/
2051
2052	if (gmx_mm_any_lt(rsq02,rcutoff2))
2053	{
2054
2055	r02 = _mm_mul_ps(rsq02,rinv02);
2056
2057	/* EWALD ELECTROSTATICS */
2058
2059	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2060	ewrt = _mm_mul_ps(r02,ewtabscale);
2061	ewitab = _mm_cvttps_epi32(ewrt);
2062	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2063	ewitab = _mm_slli_epi32(ewitab,2);
2064	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2065	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2066	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2067	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2068	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2069	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2070	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2071	velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2072	felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2073
2074	d = _mm_sub_ps(r02,rswitch);
2075	d = _mm_max_ps(d,_mm_setzero_ps());
2076	d2 = _mm_mul_ps(d,d);
2077	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2078
2079	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2080
2081	/* Evaluate switch function */
2082	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2083	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2084	cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2085
2086	fscal = felec;
2087
2088	fscal = _mm_and_ps(fscal,cutoff_mask);
2089
2090	/* Calculate temporary vectorial force */
2091	tx = _mm_mul_ps(fscal,dx02);
2092	ty = _mm_mul_ps(fscal,dy02);
2093	tz = _mm_mul_ps(fscal,dz02);
2094
2095	/* Update vectorial force */
2096	fix0 = _mm_add_ps(fix0,tx);
2097	fiy0 = _mm_add_ps(fiy0,ty);
2098	fiz0 = _mm_add_ps(fiz0,tz);
2099
2100	fjx2 = _mm_add_ps(fjx2,tx);
2101	fjy2 = _mm_add_ps(fjy2,ty);
2102	fjz2 = _mm_add_ps(fjz2,tz);
2103
2104	}
2105
2106	/**************************
2107	* CALCULATE INTERACTIONS *
2108	**************************/
2109
2110	if (gmx_mm_any_lt(rsq10,rcutoff2))
2111	{
2112
2113	r10 = _mm_mul_ps(rsq10,rinv10);
2114
2115	/* EWALD ELECTROSTATICS */
2116
2117	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2118	ewrt = _mm_mul_ps(r10,ewtabscale);
2119	ewitab = _mm_cvttps_epi32(ewrt);
2120	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2121	ewitab = _mm_slli_epi32(ewitab,2);
2122	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2123	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2124	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2125	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2126	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2127	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2128	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2129	velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2130	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2131
2132	d = _mm_sub_ps(r10,rswitch);
2133	d = _mm_max_ps(d,_mm_setzero_ps());
2134	d2 = _mm_mul_ps(d,d);
2135	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2136
2137	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2138
2139	/* Evaluate switch function */
2140	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2141	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2142	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2143
2144	fscal = felec;
2145
2146	fscal = _mm_and_ps(fscal,cutoff_mask);
2147
2148	/* Calculate temporary vectorial force */
2149	tx = _mm_mul_ps(fscal,dx10);
2150	ty = _mm_mul_ps(fscal,dy10);
2151	tz = _mm_mul_ps(fscal,dz10);
2152
2153	/* Update vectorial force */
2154	fix1 = _mm_add_ps(fix1,tx);
2155	fiy1 = _mm_add_ps(fiy1,ty);
2156	fiz1 = _mm_add_ps(fiz1,tz);
2157
2158	fjx0 = _mm_add_ps(fjx0,tx);
2159	fjy0 = _mm_add_ps(fjy0,ty);
2160	fjz0 = _mm_add_ps(fjz0,tz);
2161
2162	}
2163
2164	/**************************
2165	* CALCULATE INTERACTIONS *
2166	**************************/
2167
2168	if (gmx_mm_any_lt(rsq11,rcutoff2))
2169	{
2170
2171	r11 = _mm_mul_ps(rsq11,rinv11);
2172
2173	/* EWALD ELECTROSTATICS */
2174
2175	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2176	ewrt = _mm_mul_ps(r11,ewtabscale);
2177	ewitab = _mm_cvttps_epi32(ewrt);
2178	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2179	ewitab = _mm_slli_epi32(ewitab,2);
2180	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2181	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2182	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2183	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2184	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2185	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2186	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2187	velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2188	felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2189
2190	d = _mm_sub_ps(r11,rswitch);
2191	d = _mm_max_ps(d,_mm_setzero_ps());
2192	d2 = _mm_mul_ps(d,d);
2193	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2194
2195	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2196
2197	/* Evaluate switch function */
2198	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2199	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2200	cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2201
2202	fscal = felec;
2203
2204	fscal = _mm_and_ps(fscal,cutoff_mask);
2205
2206	/* Calculate temporary vectorial force */
2207	tx = _mm_mul_ps(fscal,dx11);
2208	ty = _mm_mul_ps(fscal,dy11);
2209	tz = _mm_mul_ps(fscal,dz11);
2210
2211	/* Update vectorial force */
2212	fix1 = _mm_add_ps(fix1,tx);
2213	fiy1 = _mm_add_ps(fiy1,ty);
2214	fiz1 = _mm_add_ps(fiz1,tz);
2215
2216	fjx1 = _mm_add_ps(fjx1,tx);
2217	fjy1 = _mm_add_ps(fjy1,ty);
2218	fjz1 = _mm_add_ps(fjz1,tz);
2219
2220	}
2221
2222	/**************************
2223	* CALCULATE INTERACTIONS *
2224	**************************/
2225
2226	if (gmx_mm_any_lt(rsq12,rcutoff2))
2227	{
2228
2229	r12 = _mm_mul_ps(rsq12,rinv12);
2230
2231	/* EWALD ELECTROSTATICS */
2232
2233	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2234	ewrt = _mm_mul_ps(r12,ewtabscale);
2235	ewitab = _mm_cvttps_epi32(ewrt);
2236	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2237	ewitab = _mm_slli_epi32(ewitab,2);
2238	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2239	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2240	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2241	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2242	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2243	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2244	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2245	velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2246	felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2247
2248	d = _mm_sub_ps(r12,rswitch);
2249	d = _mm_max_ps(d,_mm_setzero_ps());
2250	d2 = _mm_mul_ps(d,d);
2251	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2252
2253	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2254
2255	/* Evaluate switch function */
2256	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2257	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2258	cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2259
2260	fscal = felec;
2261
2262	fscal = _mm_and_ps(fscal,cutoff_mask);
2263
2264	/* Calculate temporary vectorial force */
2265	tx = _mm_mul_ps(fscal,dx12);
2266	ty = _mm_mul_ps(fscal,dy12);
2267	tz = _mm_mul_ps(fscal,dz12);
2268
2269	/* Update vectorial force */
2270	fix1 = _mm_add_ps(fix1,tx);
2271	fiy1 = _mm_add_ps(fiy1,ty);
2272	fiz1 = _mm_add_ps(fiz1,tz);
2273
2274	fjx2 = _mm_add_ps(fjx2,tx);
2275	fjy2 = _mm_add_ps(fjy2,ty);
2276	fjz2 = _mm_add_ps(fjz2,tz);
2277
2278	}
2279
2280	/**************************
2281	* CALCULATE INTERACTIONS *
2282	**************************/
2283
2284	if (gmx_mm_any_lt(rsq20,rcutoff2))
2285	{
2286
2287	r20 = _mm_mul_ps(rsq20,rinv20);
2288
2289	/* EWALD ELECTROSTATICS */
2290
2291	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2292	ewrt = _mm_mul_ps(r20,ewtabscale);
2293	ewitab = _mm_cvttps_epi32(ewrt);
2294	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2295	ewitab = _mm_slli_epi32(ewitab,2);
2296	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2297	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2298	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2299	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2300	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2301	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2302	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2303	velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2304	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2305
2306	d = _mm_sub_ps(r20,rswitch);
2307	d = _mm_max_ps(d,_mm_setzero_ps());
2308	d2 = _mm_mul_ps(d,d);
2309	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2310
2311	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2312
2313	/* Evaluate switch function */
2314	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2315	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2316	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2317
2318	fscal = felec;
2319
2320	fscal = _mm_and_ps(fscal,cutoff_mask);
2321
2322	/* Calculate temporary vectorial force */
2323	tx = _mm_mul_ps(fscal,dx20);
2324	ty = _mm_mul_ps(fscal,dy20);
2325	tz = _mm_mul_ps(fscal,dz20);
2326
2327	/* Update vectorial force */
2328	fix2 = _mm_add_ps(fix2,tx);
2329	fiy2 = _mm_add_ps(fiy2,ty);
2330	fiz2 = _mm_add_ps(fiz2,tz);
2331
2332	fjx0 = _mm_add_ps(fjx0,tx);
2333	fjy0 = _mm_add_ps(fjy0,ty);
2334	fjz0 = _mm_add_ps(fjz0,tz);
2335
2336	}
2337
2338	/**************************
2339	* CALCULATE INTERACTIONS *
2340	**************************/
2341
2342	if (gmx_mm_any_lt(rsq21,rcutoff2))
2343	{
2344
2345	r21 = _mm_mul_ps(rsq21,rinv21);
2346
2347	/* EWALD ELECTROSTATICS */
2348
2349	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2350	ewrt = _mm_mul_ps(r21,ewtabscale);
2351	ewitab = _mm_cvttps_epi32(ewrt);
2352	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2353	ewitab = _mm_slli_epi32(ewitab,2);
2354	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2355	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2356	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2357	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2358	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2359	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2360	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2361	velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2362	felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2363
2364	d = _mm_sub_ps(r21,rswitch);
2365	d = _mm_max_ps(d,_mm_setzero_ps());
2366	d2 = _mm_mul_ps(d,d);
2367	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2368
2369	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2370
2371	/* Evaluate switch function */
2372	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2373	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2374	cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2375
2376	fscal = felec;
2377
2378	fscal = _mm_and_ps(fscal,cutoff_mask);
2379
2380	/* Calculate temporary vectorial force */
2381	tx = _mm_mul_ps(fscal,dx21);
2382	ty = _mm_mul_ps(fscal,dy21);
2383	tz = _mm_mul_ps(fscal,dz21);
2384
2385	/* Update vectorial force */
2386	fix2 = _mm_add_ps(fix2,tx);
2387	fiy2 = _mm_add_ps(fiy2,ty);
2388	fiz2 = _mm_add_ps(fiz2,tz);
2389
2390	fjx1 = _mm_add_ps(fjx1,tx);
2391	fjy1 = _mm_add_ps(fjy1,ty);
2392	fjz1 = _mm_add_ps(fjz1,tz);
2393
2394	}
2395
2396	/**************************
2397	* CALCULATE INTERACTIONS *
2398	**************************/
2399
2400	if (gmx_mm_any_lt(rsq22,rcutoff2))
2401	{
2402
2403	r22 = _mm_mul_ps(rsq22,rinv22);
2404
2405	/* EWALD ELECTROSTATICS */
2406
2407	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2408	ewrt = _mm_mul_ps(r22,ewtabscale);
2409	ewitab = _mm_cvttps_epi32(ewrt);
2410	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2411	ewitab = _mm_slli_epi32(ewitab,2);
2412	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2413	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2414	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2415	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2416	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2417	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2418	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2419	velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2420	felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2421
2422	d = _mm_sub_ps(r22,rswitch);
2423	d = _mm_max_ps(d,_mm_setzero_ps());
2424	d2 = _mm_mul_ps(d,d);
2425	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2426
2427	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2428
2429	/* Evaluate switch function */
2430	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2431	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2432	cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2433
2434	fscal = felec;
2435
2436	fscal = _mm_and_ps(fscal,cutoff_mask);
2437
2438	/* Calculate temporary vectorial force */
2439	tx = _mm_mul_ps(fscal,dx22);
2440	ty = _mm_mul_ps(fscal,dy22);
2441	tz = _mm_mul_ps(fscal,dz22);
2442
2443	/* Update vectorial force */
2444	fix2 = _mm_add_ps(fix2,tx);
2445	fiy2 = _mm_add_ps(fiy2,ty);
2446	fiz2 = _mm_add_ps(fiz2,tz);
2447
2448	fjx2 = _mm_add_ps(fjx2,tx);
2449	fjy2 = _mm_add_ps(fjy2,ty);
2450	fjz2 = _mm_add_ps(fjz2,tz);
2451
2452	}
2453
2454	fjptrA = f+j_coord_offsetA;
2455	fjptrB = f+j_coord_offsetB;
2456	fjptrC = f+j_coord_offsetC;
2457	fjptrD = f+j_coord_offsetD;
2458
2459	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2460	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2461
2462	/* Inner loop uses 573 flops */
2463	}
2464
2465	if(jidx<j_index_end)
2466	{
2467
2468	/* Get j neighbor index, and coordinate index */
2469	jnrlistA = jjnr[jidx];
2470	jnrlistB = jjnr[jidx+1];
2471	jnrlistC = jjnr[jidx+2];
2472	jnrlistD = jjnr[jidx+3];
2473	/* Sign of each element will be negative for non-real atoms.
2474	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2475	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2476	*/
2477	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2478	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2479	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2480	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2481	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2482	j_coord_offsetA = DIM3*jnrA;
2483	j_coord_offsetB = DIM3*jnrB;
2484	j_coord_offsetC = DIM3*jnrC;
2485	j_coord_offsetD = DIM3*jnrD;
2486
2487	/* load j atom coordinates */
2488	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2489	x+j_coord_offsetC,x+j_coord_offsetD,
2490	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2491
2492	/* Calculate displacement vector */
2493	dx00 = _mm_sub_ps(ix0,jx0);
2494	dy00 = _mm_sub_ps(iy0,jy0);
2495	dz00 = _mm_sub_ps(iz0,jz0);
2496	dx01 = _mm_sub_ps(ix0,jx1);
2497	dy01 = _mm_sub_ps(iy0,jy1);
2498	dz01 = _mm_sub_ps(iz0,jz1);
2499	dx02 = _mm_sub_ps(ix0,jx2);
2500	dy02 = _mm_sub_ps(iy0,jy2);
2501	dz02 = _mm_sub_ps(iz0,jz2);
2502	dx10 = _mm_sub_ps(ix1,jx0);
2503	dy10 = _mm_sub_ps(iy1,jy0);
2504	dz10 = _mm_sub_ps(iz1,jz0);
2505	dx11 = _mm_sub_ps(ix1,jx1);
2506	dy11 = _mm_sub_ps(iy1,jy1);
2507	dz11 = _mm_sub_ps(iz1,jz1);
2508	dx12 = _mm_sub_ps(ix1,jx2);
2509	dy12 = _mm_sub_ps(iy1,jy2);
2510	dz12 = _mm_sub_ps(iz1,jz2);
2511	dx20 = _mm_sub_ps(ix2,jx0);
2512	dy20 = _mm_sub_ps(iy2,jy0);
2513	dz20 = _mm_sub_ps(iz2,jz0);
2514	dx21 = _mm_sub_ps(ix2,jx1);
2515	dy21 = _mm_sub_ps(iy2,jy1);
2516	dz21 = _mm_sub_ps(iz2,jz1);
2517	dx22 = _mm_sub_ps(ix2,jx2);
2518	dy22 = _mm_sub_ps(iy2,jy2);
2519	dz22 = _mm_sub_ps(iz2,jz2);
2520
2521	/* Calculate squared distance and things based on it */
2522	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2523	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2524	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2525	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2526	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2527	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2528	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2529	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2530	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2531
2532	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
2533	rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
2534	rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
2535	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
2536	rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
2537	rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
2538	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
2539	rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
2540	rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
2541
2542	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2543	rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2544	rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2545	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2546	rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2547	rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2548	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2549	rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2550	rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2551
2552	fjx0 = _mm_setzero_ps();
2553	fjy0 = _mm_setzero_ps();
2554	fjz0 = _mm_setzero_ps();
2555	fjx1 = _mm_setzero_ps();
2556	fjy1 = _mm_setzero_ps();
2557	fjz1 = _mm_setzero_ps();
2558	fjx2 = _mm_setzero_ps();
2559	fjy2 = _mm_setzero_ps();
2560	fjz2 = _mm_setzero_ps();
2561
2562	/**************************
2563	* CALCULATE INTERACTIONS *
2564	**************************/
2565
2566	if (gmx_mm_any_lt(rsq00,rcutoff2))
2567	{
2568
2569	r00 = _mm_mul_ps(rsq00,rinv00);
2570	r00 = _mm_andnot_ps(dummy_mask,r00);
2571
2572	/* EWALD ELECTROSTATICS */
2573
2574	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2575	ewrt = _mm_mul_ps(r00,ewtabscale);
2576	ewitab = _mm_cvttps_epi32(ewrt);
2577	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2578	ewitab = _mm_slli_epi32(ewitab,2);
2579	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2580	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2581	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2582	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2583	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2584	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2585	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2586	velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2587	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2588
2589	/* LENNARD-JONES DISPERSION/REPULSION */
2590
2591	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2592	vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2593	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2594	vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2595	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2596
2597	d = _mm_sub_ps(r00,rswitch);
2598	d = _mm_max_ps(d,_mm_setzero_ps());
2599	d2 = _mm_mul_ps(d,d);
2600	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2601
2602	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2603
2604	/* Evaluate switch function */
2605	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2606	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2607	fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2608	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2609
2610	fscal = _mm_add_ps(felec,fvdw);
2611
2612	fscal = _mm_and_ps(fscal,cutoff_mask);
2613
2614	fscal = _mm_andnot_ps(dummy_mask,fscal);
2615
2616	/* Calculate temporary vectorial force */
2617	tx = _mm_mul_ps(fscal,dx00);
2618	ty = _mm_mul_ps(fscal,dy00);
2619	tz = _mm_mul_ps(fscal,dz00);
2620
2621	/* Update vectorial force */
2622	fix0 = _mm_add_ps(fix0,tx);
2623	fiy0 = _mm_add_ps(fiy0,ty);
2624	fiz0 = _mm_add_ps(fiz0,tz);
2625
2626	fjx0 = _mm_add_ps(fjx0,tx);
2627	fjy0 = _mm_add_ps(fjy0,ty);
2628	fjz0 = _mm_add_ps(fjz0,tz);
2629
2630	}
2631
2632	/**************************
2633	* CALCULATE INTERACTIONS *
2634	**************************/
2635
2636	if (gmx_mm_any_lt(rsq01,rcutoff2))
2637	{
2638
2639	r01 = _mm_mul_ps(rsq01,rinv01);
2640	r01 = _mm_andnot_ps(dummy_mask,r01);
2641
2642	/* EWALD ELECTROSTATICS */
2643
2644	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2645	ewrt = _mm_mul_ps(r01,ewtabscale);
2646	ewitab = _mm_cvttps_epi32(ewrt);
2647	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2648	ewitab = _mm_slli_epi32(ewitab,2);
2649	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2650	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2651	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2652	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2653	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2654	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2655	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2656	velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2657	felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2658
2659	d = _mm_sub_ps(r01,rswitch);
2660	d = _mm_max_ps(d,_mm_setzero_ps());
2661	d2 = _mm_mul_ps(d,d);
2662	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2663
2664	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2665
2666	/* Evaluate switch function */
2667	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2668	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2669	cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2670
2671	fscal = felec;
2672
2673	fscal = _mm_and_ps(fscal,cutoff_mask);
2674
2675	fscal = _mm_andnot_ps(dummy_mask,fscal);
2676
2677	/* Calculate temporary vectorial force */
2678	tx = _mm_mul_ps(fscal,dx01);
2679	ty = _mm_mul_ps(fscal,dy01);
2680	tz = _mm_mul_ps(fscal,dz01);
2681
2682	/* Update vectorial force */
2683	fix0 = _mm_add_ps(fix0,tx);
2684	fiy0 = _mm_add_ps(fiy0,ty);
2685	fiz0 = _mm_add_ps(fiz0,tz);
2686
2687	fjx1 = _mm_add_ps(fjx1,tx);
2688	fjy1 = _mm_add_ps(fjy1,ty);
2689	fjz1 = _mm_add_ps(fjz1,tz);
2690
2691	}
2692
2693	/**************************
2694	* CALCULATE INTERACTIONS *
2695	**************************/
2696
2697	if (gmx_mm_any_lt(rsq02,rcutoff2))
2698	{
2699
2700	r02 = _mm_mul_ps(rsq02,rinv02);
2701	r02 = _mm_andnot_ps(dummy_mask,r02);
2702
2703	/* EWALD ELECTROSTATICS */
2704
2705	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2706	ewrt = _mm_mul_ps(r02,ewtabscale);
2707	ewitab = _mm_cvttps_epi32(ewrt);
2708	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2709	ewitab = _mm_slli_epi32(ewitab,2);
2710	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2711	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2712	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2713	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2714	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2715	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2716	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2717	velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2718	felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2719
2720	d = _mm_sub_ps(r02,rswitch);
2721	d = _mm_max_ps(d,_mm_setzero_ps());
2722	d2 = _mm_mul_ps(d,d);
2723	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2724
2725	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2726
2727	/* Evaluate switch function */
2728	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2729	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2730	cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2731
2732	fscal = felec;
2733
2734	fscal = _mm_and_ps(fscal,cutoff_mask);
2735
2736	fscal = _mm_andnot_ps(dummy_mask,fscal);
2737
2738	/* Calculate temporary vectorial force */
2739	tx = _mm_mul_ps(fscal,dx02);
2740	ty = _mm_mul_ps(fscal,dy02);
2741	tz = _mm_mul_ps(fscal,dz02);
2742
2743	/* Update vectorial force */
2744	fix0 = _mm_add_ps(fix0,tx);
2745	fiy0 = _mm_add_ps(fiy0,ty);
2746	fiz0 = _mm_add_ps(fiz0,tz);
2747
2748	fjx2 = _mm_add_ps(fjx2,tx);
2749	fjy2 = _mm_add_ps(fjy2,ty);
2750	fjz2 = _mm_add_ps(fjz2,tz);
2751
2752	}
2753
2754	/**************************
2755	* CALCULATE INTERACTIONS *
2756	**************************/
2757
2758	if (gmx_mm_any_lt(rsq10,rcutoff2))
2759	{
2760
2761	r10 = _mm_mul_ps(rsq10,rinv10);
2762	r10 = _mm_andnot_ps(dummy_mask,r10);
2763
2764	/* EWALD ELECTROSTATICS */
2765
2766	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2767	ewrt = _mm_mul_ps(r10,ewtabscale);
2768	ewitab = _mm_cvttps_epi32(ewrt);
2769	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2770	ewitab = _mm_slli_epi32(ewitab,2);
2771	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2772	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2773	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2774	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2775	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2776	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2777	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2778	velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2779	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2780
2781	d = _mm_sub_ps(r10,rswitch);
2782	d = _mm_max_ps(d,_mm_setzero_ps());
2783	d2 = _mm_mul_ps(d,d);
2784	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2785
2786	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2787
2788	/* Evaluate switch function */
2789	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2790	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2791	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2792
2793	fscal = felec;
2794
2795	fscal = _mm_and_ps(fscal,cutoff_mask);
2796
2797	fscal = _mm_andnot_ps(dummy_mask,fscal);
2798
2799	/* Calculate temporary vectorial force */
2800	tx = _mm_mul_ps(fscal,dx10);
2801	ty = _mm_mul_ps(fscal,dy10);
2802	tz = _mm_mul_ps(fscal,dz10);
2803
2804	/* Update vectorial force */
2805	fix1 = _mm_add_ps(fix1,tx);
2806	fiy1 = _mm_add_ps(fiy1,ty);
2807	fiz1 = _mm_add_ps(fiz1,tz);
2808
2809	fjx0 = _mm_add_ps(fjx0,tx);
2810	fjy0 = _mm_add_ps(fjy0,ty);
2811	fjz0 = _mm_add_ps(fjz0,tz);
2812
2813	}
2814
2815	/**************************
2816	* CALCULATE INTERACTIONS *
2817	**************************/
2818
2819	if (gmx_mm_any_lt(rsq11,rcutoff2))
2820	{
2821
2822	r11 = _mm_mul_ps(rsq11,rinv11);
2823	r11 = _mm_andnot_ps(dummy_mask,r11);
2824
2825	/* EWALD ELECTROSTATICS */
2826
2827	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2828	ewrt = _mm_mul_ps(r11,ewtabscale);
2829	ewitab = _mm_cvttps_epi32(ewrt);
2830	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2831	ewitab = _mm_slli_epi32(ewitab,2);
2832	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2833	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2834	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2835	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2836	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2837	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2838	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2839	velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2840	felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2841
2842	d = _mm_sub_ps(r11,rswitch);
2843	d = _mm_max_ps(d,_mm_setzero_ps());
2844	d2 = _mm_mul_ps(d,d);
2845	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2846
2847	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2848
2849	/* Evaluate switch function */
2850	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2851	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2852	cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2853
2854	fscal = felec;
2855
2856	fscal = _mm_and_ps(fscal,cutoff_mask);
2857
2858	fscal = _mm_andnot_ps(dummy_mask,fscal);
2859
2860	/* Calculate temporary vectorial force */
2861	tx = _mm_mul_ps(fscal,dx11);
2862	ty = _mm_mul_ps(fscal,dy11);
2863	tz = _mm_mul_ps(fscal,dz11);
2864
2865	/* Update vectorial force */
2866	fix1 = _mm_add_ps(fix1,tx);
2867	fiy1 = _mm_add_ps(fiy1,ty);
2868	fiz1 = _mm_add_ps(fiz1,tz);
2869
2870	fjx1 = _mm_add_ps(fjx1,tx);
2871	fjy1 = _mm_add_ps(fjy1,ty);
2872	fjz1 = _mm_add_ps(fjz1,tz);
2873
2874	}
2875
2876	/**************************
2877	* CALCULATE INTERACTIONS *
2878	**************************/
2879
2880	if (gmx_mm_any_lt(rsq12,rcutoff2))
2881	{
2882
2883	r12 = _mm_mul_ps(rsq12,rinv12);
2884	r12 = _mm_andnot_ps(dummy_mask,r12);
2885
2886	/* EWALD ELECTROSTATICS */
2887
2888	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2889	ewrt = _mm_mul_ps(r12,ewtabscale);
2890	ewitab = _mm_cvttps_epi32(ewrt);
2891	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2892	ewitab = _mm_slli_epi32(ewitab,2);
2893	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2894	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2895	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2896	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2897	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2898	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2899	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2900	velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2901	felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2902
2903	d = _mm_sub_ps(r12,rswitch);
2904	d = _mm_max_ps(d,_mm_setzero_ps());
2905	d2 = _mm_mul_ps(d,d);
2906	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2907
2908	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2909
2910	/* Evaluate switch function */
2911	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2912	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2913	cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2914
2915	fscal = felec;
2916
2917	fscal = _mm_and_ps(fscal,cutoff_mask);
2918
2919	fscal = _mm_andnot_ps(dummy_mask,fscal);
2920
2921	/* Calculate temporary vectorial force */
2922	tx = _mm_mul_ps(fscal,dx12);
2923	ty = _mm_mul_ps(fscal,dy12);
2924	tz = _mm_mul_ps(fscal,dz12);
2925
2926	/* Update vectorial force */
2927	fix1 = _mm_add_ps(fix1,tx);
2928	fiy1 = _mm_add_ps(fiy1,ty);
2929	fiz1 = _mm_add_ps(fiz1,tz);
2930
2931	fjx2 = _mm_add_ps(fjx2,tx);
2932	fjy2 = _mm_add_ps(fjy2,ty);
2933	fjz2 = _mm_add_ps(fjz2,tz);
2934
2935	}
2936
2937	/**************************
2938	* CALCULATE INTERACTIONS *
2939	**************************/
2940
2941	if (gmx_mm_any_lt(rsq20,rcutoff2))
2942	{
2943
2944	r20 = _mm_mul_ps(rsq20,rinv20);
2945	r20 = _mm_andnot_ps(dummy_mask,r20);
2946
2947	/* EWALD ELECTROSTATICS */
2948
2949	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2950	ewrt = _mm_mul_ps(r20,ewtabscale);
2951	ewitab = _mm_cvttps_epi32(ewrt);
2952	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2953	ewitab = _mm_slli_epi32(ewitab,2);
2954	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
2955	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
2956	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
2957	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
2958	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
2959	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2960	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2961	velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2962	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2963
2964	d = _mm_sub_ps(r20,rswitch);
2965	d = _mm_max_ps(d,_mm_setzero_ps());
2966	d2 = _mm_mul_ps(d,d);
2967	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2968
2969	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2970
2971	/* Evaluate switch function */
2972	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
2973	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2974	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2975
2976	fscal = felec;
2977
2978	fscal = _mm_and_ps(fscal,cutoff_mask);
2979
2980	fscal = _mm_andnot_ps(dummy_mask,fscal);
2981
2982	/* Calculate temporary vectorial force */
2983	tx = _mm_mul_ps(fscal,dx20);
2984	ty = _mm_mul_ps(fscal,dy20);
2985	tz = _mm_mul_ps(fscal,dz20);
2986
2987	/* Update vectorial force */
2988	fix2 = _mm_add_ps(fix2,tx);
2989	fiy2 = _mm_add_ps(fiy2,ty);
2990	fiz2 = _mm_add_ps(fiz2,tz);
2991
2992	fjx0 = _mm_add_ps(fjx0,tx);
2993	fjy0 = _mm_add_ps(fjy0,ty);
2994	fjz0 = _mm_add_ps(fjz0,tz);
2995
2996	}
2997
2998	/**************************
2999	* CALCULATE INTERACTIONS *
3000	**************************/
3001
3002	if (gmx_mm_any_lt(rsq21,rcutoff2))
3003	{
3004
3005	r21 = _mm_mul_ps(rsq21,rinv21);
3006	r21 = _mm_andnot_ps(dummy_mask,r21);
3007
3008	/* EWALD ELECTROSTATICS */
3009
3010	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3011	ewrt = _mm_mul_ps(r21,ewtabscale);
3012	ewitab = _mm_cvttps_epi32(ewrt);
3013	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
3014	ewitab = _mm_slli_epi32(ewitab,2);
3015	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
3016	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
3017	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
3018	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
3019	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
3020	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3021	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3022	velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3023	felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3024
3025	d = _mm_sub_ps(r21,rswitch);
3026	d = _mm_max_ps(d,_mm_setzero_ps());
3027	d2 = _mm_mul_ps(d,d);
3028	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3029
3030	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3031
3032	/* Evaluate switch function */
3033	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
3034	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3035	cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
3036
3037	fscal = felec;
3038
3039	fscal = _mm_and_ps(fscal,cutoff_mask);
3040
3041	fscal = _mm_andnot_ps(dummy_mask,fscal);
3042
3043	/* Calculate temporary vectorial force */
3044	tx = _mm_mul_ps(fscal,dx21);
3045	ty = _mm_mul_ps(fscal,dy21);
3046	tz = _mm_mul_ps(fscal,dz21);
3047
3048	/* Update vectorial force */
3049	fix2 = _mm_add_ps(fix2,tx);
3050	fiy2 = _mm_add_ps(fiy2,ty);
3051	fiz2 = _mm_add_ps(fiz2,tz);
3052
3053	fjx1 = _mm_add_ps(fjx1,tx);
3054	fjy1 = _mm_add_ps(fjy1,ty);
3055	fjz1 = _mm_add_ps(fjz1,tz);
3056
3057	}
3058
3059	/**************************
3060	* CALCULATE INTERACTIONS *
3061	**************************/
3062
3063	if (gmx_mm_any_lt(rsq22,rcutoff2))
3064	{
3065
3066	r22 = _mm_mul_ps(rsq22,rinv22);
3067	r22 = _mm_andnot_ps(dummy_mask,r22);
3068
3069	/* EWALD ELECTROSTATICS */
3070
3071	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3072	ewrt = _mm_mul_ps(r22,ewtabscale);
3073	ewitab = _mm_cvttps_epi32(ewrt);
3074	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
3075	ewitab = _mm_slli_epi32(ewitab,2);
3076	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
3077	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
3078	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
3079	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
3080	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
3081	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3082	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3083	velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3084	felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3085
3086	d = _mm_sub_ps(r22,rswitch);
3087	d = _mm_max_ps(d,_mm_setzero_ps());
3088	d2 = _mm_mul_ps(d,d);
3089	sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3090
3091	dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3092
3093	/* Evaluate switch function */
3094	/* fscal'=f'/r=-(vsw)'/r=-(v'sw+vdsw)/r=-v'sw/r-vdsw/r=fscalsw-vdsw/r /
3095	felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3096	cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3097
3098	fscal = felec;
3099
3100	fscal = _mm_and_ps(fscal,cutoff_mask);
3101
3102	fscal = _mm_andnot_ps(dummy_mask,fscal);
3103
3104	/* Calculate temporary vectorial force */
3105	tx = _mm_mul_ps(fscal,dx22);
3106	ty = _mm_mul_ps(fscal,dy22);
3107	tz = _mm_mul_ps(fscal,dz22);
3108
3109	/* Update vectorial force */
3110	fix2 = _mm_add_ps(fix2,tx);
3111	fiy2 = _mm_add_ps(fiy2,ty);
3112	fiz2 = _mm_add_ps(fiz2,tz);
3113
3114	fjx2 = _mm_add_ps(fjx2,tx);
3115	fjy2 = _mm_add_ps(fjy2,ty);
3116	fjz2 = _mm_add_ps(fjz2,tz);
3117
3118	}
3119
3120	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3121	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3122	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3123	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3124
3125	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3126	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3127
3128	/* Inner loop uses 582 flops */
3129	}
3130
3131	/* End of innermost loop */
3132
3133	gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3134	f+i_coord_offset,fshift+i_shift_offset);
3135
3136	/* Increment number of inner iterations */
3137	inneriter += j_index_end - j_index_start;
3138
3139	/* Outer loop uses 18 flops */
3140	}
3141
3142	/* Increment number of outer iterations */
3143	outeriter += nri;
3144
3145	/* Update outer/inner flops */
3146
3147	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter18 + inneriter582)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_F] += outeriter18 + inneriter 582;
3148	}