/home/alexxy/Develop/gromacs/src/gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse4_1_single.c

Bug Summary

File:	gromacs/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_sse4_1_single.c
Location:	line 194, column 5
Description:	Value stored to 'j_coord_offsetA' is never read

Annotated Source Code

1	/*
2	* This file is part of the GROMACS molecular simulation package.
3	*
4	* Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5	* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6	* and including many others, as listed in the AUTHORS file in the
7	* top-level source directory and at http://www.gromacs.org.
8	*
9	* GROMACS is free software; you can redistribute it and/or
10	* modify it under the terms of the GNU Lesser General Public License
11	* as published by the Free Software Foundation; either version 2.1
12	* of the License, or (at your option) any later version.
13	*
14	* GROMACS is distributed in the hope that it will be useful,
15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17	* Lesser General Public License for more details.
18	*
19	* You should have received a copy of the GNU Lesser General Public
20	* License along with GROMACS; if not, see
21	* http://www.gnu.org/licenses, or write to the Free Software Foundation,
22	* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23	*
24	* If you want to redistribute modifications to GROMACS, please
25	* consider that scientific software is very special. Version
26	* control is crucial - bugs must be traceable. We will be happy to
27	* consider code for inclusion in the official distribution, but
28	* derived work must not be called official GROMACS. Details are found
29	* in the README & COPYING files - if they are missing, get the
30	* official version at http://www.gromacs.org.
31	*
32	* To help us fund GROMACS development, we humbly ask that you cite
33	* the research papers on the package. Check out http://www.gromacs.org.
34	*/
35	/*
36	* Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37	*/
38	#ifdef HAVE_CONFIG_H1
39	#include <config.h>
40	#endif
41
42	#include <math.h>
43
44	#include "../nb_kernel.h"
45	#include "types/simple.h"
46	#include "gromacs/math/vec.h"
47	#include "nrnb.h"
48
49	#include "gromacs/simd/math_x86_sse4_1_single.h"
50	#include "kernelutil_x86_sse4_1_single.h"
51
52	/*
53	* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_single
54	* Electrostatics interaction: Ewald
55	* VdW interaction: LJEwald
56	* Geometry: Water3-Water3
57	* Calculate force/pot: PotentialAndForce
58	*/
59	void
60	nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse4_1_single
61	(t_nblist * gmx_restrict nlist,
62	rvec * gmx_restrict xx,
63	rvec * gmx_restrict ff,
64	t_forcerec * gmx_restrict fr,
65	t_mdatoms * gmx_restrict mdatoms,
66	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
67	t_nrnb * gmx_restrict nrnb)
68	{
69	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70	* just 0 for non-waters.
71	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72	* jnr indices corresponding to data put in the four positions in the SIMD register.
73	*/
74	int i_shift_offset,i_coord_offset,outeriter,inneriter;
75	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76	int jnrA,jnrB,jnrC,jnrD;
77	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79	int iinr,jindex,jjnr,shiftidx,*gid;
80	real rcutoff_scalar;
81	real shiftvec,fshift,x,f;
82	real fjptrA,fjptrB,fjptrC,fjptrD;
83	real scratch[4*DIM3];
84	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85	int vdwioffset0;
86	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87	int vdwioffset1;
88	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89	int vdwioffset2;
90	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93	int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94	__m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95	int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96	__m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98	__m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99	__m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101	__m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102	__m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104	__m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105	__m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
107	real *charge;
108	int nvdwtype;
109	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110	int *vdwtype;
111	real *vdwparam;
112	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
113	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
114	__m128 c6grid_00;
115	__m128 c6grid_01;
116	__m128 c6grid_02;
117	__m128 c6grid_10;
118	__m128 c6grid_11;
119	__m128 c6grid_12;
120	__m128 c6grid_20;
121	__m128 c6grid_21;
122	__m128 c6grid_22;
123	__m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
124	real *vdwgridparam;
125	__m128 one_half = _mm_set1_ps(0.5);
126	__m128 minus_one = _mm_set1_ps(-1.0);
127	__m128i ewitab;
128	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
129	real *ewtab;
130	__m128 dummy_mask,cutoff_mask;
131	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
132	__m128 one = _mm_set1_ps(1.0);
133	__m128 two = _mm_set1_ps(2.0);
134	x = xx[0];
135	f = ff[0];
136
137	nri = nlist->nri;
138	iinr = nlist->iinr;
139	jindex = nlist->jindex;
140	jjnr = nlist->jjnr;
141	shiftidx = nlist->shift;
142	gid = nlist->gid;
143	shiftvec = fr->shift_vec[0];
144	fshift = fr->fshift[0];
145	facel = _mm_set1_ps(fr->epsfac);
146	charge = mdatoms->chargeA;
147	nvdwtype = fr->ntype;
148	vdwparam = fr->nbfp;
149	vdwtype = mdatoms->typeA;
150	vdwgridparam = fr->ljpme_c6grid;
151	sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
152	ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
153	ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
154
155	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
156	ewtab = fr->ic->tabq_coul_FDV0;
157	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
158	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
159
160	/* Setup water-specific parameters */
161	inr = nlist->iinr[0];
162	iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
163	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
164	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
165	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
166
167	jq0 = _mm_set1_ps(charge[inr+0]);
168	jq1 = _mm_set1_ps(charge[inr+1]);
169	jq2 = _mm_set1_ps(charge[inr+2]);
170	vdwjidx0A = 2*vdwtype[inr+0];
171	qq00 = _mm_mul_ps(iq0,jq0);
172	c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
173	c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
174	c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
175	qq01 = _mm_mul_ps(iq0,jq1);
176	qq02 = _mm_mul_ps(iq0,jq2);
177	qq10 = _mm_mul_ps(iq1,jq0);
178	qq11 = _mm_mul_ps(iq1,jq1);
179	qq12 = _mm_mul_ps(iq1,jq2);
180	qq20 = _mm_mul_ps(iq2,jq0);
181	qq21 = _mm_mul_ps(iq2,jq1);
182	qq22 = _mm_mul_ps(iq2,jq2);
183
184	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
185	rcutoff_scalar = fr->rcoulomb;
186	rcutoff = _mm_set1_ps(rcutoff_scalar);
187	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
188
189	sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
190	rvdw = _mm_set1_ps(fr->rvdw);
191
192	/* Avoid stupid compiler warnings */
193	jnrA = jnrB = jnrC = jnrD = 0;
194	j_coord_offsetA = 0;
	Value stored to 'j_coord_offsetA' is never read
195	j_coord_offsetB = 0;
196	j_coord_offsetC = 0;
197	j_coord_offsetD = 0;
198
199	outeriter = 0;
200	inneriter = 0;
201
202	for(iidx=0;iidx<4*DIM3;iidx++)
203	{
204	scratch[iidx] = 0.0;
205	}
206
207	/* Start outer loop over neighborlists */
208	for(iidx=0; iidx<nri; iidx++)
209	{
210	/* Load shift vector for this list */
211	i_shift_offset = DIM3*shiftidx[iidx];
212
213	/* Load limits for loop over neighbors */
214	j_index_start = jindex[iidx];
215	j_index_end = jindex[iidx+1];
216
217	/* Get outer coordinate index */
218	inr = iinr[iidx];
219	i_coord_offset = DIM3*inr;
220
221	/* Load i particle coords and add shift vector */
222	gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
223	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
224
225	fix0 = _mm_setzero_ps();
226	fiy0 = _mm_setzero_ps();
227	fiz0 = _mm_setzero_ps();
228	fix1 = _mm_setzero_ps();
229	fiy1 = _mm_setzero_ps();
230	fiz1 = _mm_setzero_ps();
231	fix2 = _mm_setzero_ps();
232	fiy2 = _mm_setzero_ps();
233	fiz2 = _mm_setzero_ps();
234
235	/* Reset potential sums */
236	velecsum = _mm_setzero_ps();
237	vvdwsum = _mm_setzero_ps();
238
239	/* Start inner kernel loop */
240	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
241	{
242
243	/* Get j neighbor index, and coordinate index */
244	jnrA = jjnr[jidx];
245	jnrB = jjnr[jidx+1];
246	jnrC = jjnr[jidx+2];
247	jnrD = jjnr[jidx+3];
248	j_coord_offsetA = DIM3*jnrA;
249	j_coord_offsetB = DIM3*jnrB;
250	j_coord_offsetC = DIM3*jnrC;
251	j_coord_offsetD = DIM3*jnrD;
252
253	/* load j atom coordinates */
254	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
255	x+j_coord_offsetC,x+j_coord_offsetD,
256	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
257
258	/* Calculate displacement vector */
259	dx00 = _mm_sub_ps(ix0,jx0);
260	dy00 = _mm_sub_ps(iy0,jy0);
261	dz00 = _mm_sub_ps(iz0,jz0);
262	dx01 = _mm_sub_ps(ix0,jx1);
263	dy01 = _mm_sub_ps(iy0,jy1);
264	dz01 = _mm_sub_ps(iz0,jz1);
265	dx02 = _mm_sub_ps(ix0,jx2);
266	dy02 = _mm_sub_ps(iy0,jy2);
267	dz02 = _mm_sub_ps(iz0,jz2);
268	dx10 = _mm_sub_ps(ix1,jx0);
269	dy10 = _mm_sub_ps(iy1,jy0);
270	dz10 = _mm_sub_ps(iz1,jz0);
271	dx11 = _mm_sub_ps(ix1,jx1);
272	dy11 = _mm_sub_ps(iy1,jy1);
273	dz11 = _mm_sub_ps(iz1,jz1);
274	dx12 = _mm_sub_ps(ix1,jx2);
275	dy12 = _mm_sub_ps(iy1,jy2);
276	dz12 = _mm_sub_ps(iz1,jz2);
277	dx20 = _mm_sub_ps(ix2,jx0);
278	dy20 = _mm_sub_ps(iy2,jy0);
279	dz20 = _mm_sub_ps(iz2,jz0);
280	dx21 = _mm_sub_ps(ix2,jx1);
281	dy21 = _mm_sub_ps(iy2,jy1);
282	dz21 = _mm_sub_ps(iz2,jz1);
283	dx22 = _mm_sub_ps(ix2,jx2);
284	dy22 = _mm_sub_ps(iy2,jy2);
285	dz22 = _mm_sub_ps(iz2,jz2);
286
287	/* Calculate squared distance and things based on it */
288	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
289	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
290	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
291	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
292	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
293	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
294	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
295	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
296	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
297
298	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
299	rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
300	rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
301	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
302	rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
303	rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
304	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
305	rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
306	rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
307
308	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
309	rinvsq01 = _mm_mul_ps(rinv01,rinv01);
310	rinvsq02 = _mm_mul_ps(rinv02,rinv02);
311	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
312	rinvsq11 = _mm_mul_ps(rinv11,rinv11);
313	rinvsq12 = _mm_mul_ps(rinv12,rinv12);
314	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
315	rinvsq21 = _mm_mul_ps(rinv21,rinv21);
316	rinvsq22 = _mm_mul_ps(rinv22,rinv22);
317
318	fjx0 = _mm_setzero_ps();
319	fjy0 = _mm_setzero_ps();
320	fjz0 = _mm_setzero_ps();
321	fjx1 = _mm_setzero_ps();
322	fjy1 = _mm_setzero_ps();
323	fjz1 = _mm_setzero_ps();
324	fjx2 = _mm_setzero_ps();
325	fjy2 = _mm_setzero_ps();
326	fjz2 = _mm_setzero_ps();
327
328	/**************************
329	* CALCULATE INTERACTIONS *
330	**************************/
331
332	if (gmx_mm_any_lt(rsq00,rcutoff2))
333	{
334
335	r00 = _mm_mul_ps(rsq00,rinv00);
336
337	/* EWALD ELECTROSTATICS */
338
339	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
340	ewrt = _mm_mul_ps(r00,ewtabscale);
341	ewitab = _mm_cvttps_epi32(ewrt);
342	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
343	ewitab = _mm_slli_epi32(ewitab,2);
344	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
345	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
346	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
347	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
348	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
349	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
350	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
351	velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
352	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
353
354	/* Analytical LJ-PME */
355	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
356	ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
357	ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
358	exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
359	/* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
360	poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
361	/* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
362	vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
363	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
364	vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
365	_mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
366	/* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
367	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
368
369	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
370
371	/* Update potential sum for this i atom from the interaction with this j atom. */
372	velec = _mm_and_ps(velec,cutoff_mask);
373	velecsum = _mm_add_ps(velecsum,velec);
374	vvdw = _mm_and_ps(vvdw,cutoff_mask);
375	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
376
377	fscal = _mm_add_ps(felec,fvdw);
378
379	fscal = _mm_and_ps(fscal,cutoff_mask);
380
381	/* Calculate temporary vectorial force */
382	tx = _mm_mul_ps(fscal,dx00);
383	ty = _mm_mul_ps(fscal,dy00);
384	tz = _mm_mul_ps(fscal,dz00);
385
386	/* Update vectorial force */
387	fix0 = _mm_add_ps(fix0,tx);
388	fiy0 = _mm_add_ps(fiy0,ty);
389	fiz0 = _mm_add_ps(fiz0,tz);
390
391	fjx0 = _mm_add_ps(fjx0,tx);
392	fjy0 = _mm_add_ps(fjy0,ty);
393	fjz0 = _mm_add_ps(fjz0,tz);
394
395	}
396
397	/**************************
398	* CALCULATE INTERACTIONS *
399	**************************/
400
401	if (gmx_mm_any_lt(rsq01,rcutoff2))
402	{
403
404	r01 = _mm_mul_ps(rsq01,rinv01);
405
406	/* EWALD ELECTROSTATICS */
407
408	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
409	ewrt = _mm_mul_ps(r01,ewtabscale);
410	ewitab = _mm_cvttps_epi32(ewrt);
411	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
412	ewitab = _mm_slli_epi32(ewitab,2);
413	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
414	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
415	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
416	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
417	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
418	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
419	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
420	velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
421	felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
422
423	cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
424
425	/* Update potential sum for this i atom from the interaction with this j atom. */
426	velec = _mm_and_ps(velec,cutoff_mask);
427	velecsum = _mm_add_ps(velecsum,velec);
428
429	fscal = felec;
430
431	fscal = _mm_and_ps(fscal,cutoff_mask);
432
433	/* Calculate temporary vectorial force */
434	tx = _mm_mul_ps(fscal,dx01);
435	ty = _mm_mul_ps(fscal,dy01);
436	tz = _mm_mul_ps(fscal,dz01);
437
438	/* Update vectorial force */
439	fix0 = _mm_add_ps(fix0,tx);
440	fiy0 = _mm_add_ps(fiy0,ty);
441	fiz0 = _mm_add_ps(fiz0,tz);
442
443	fjx1 = _mm_add_ps(fjx1,tx);
444	fjy1 = _mm_add_ps(fjy1,ty);
445	fjz1 = _mm_add_ps(fjz1,tz);
446
447	}
448
449	/**************************
450	* CALCULATE INTERACTIONS *
451	**************************/
452
453	if (gmx_mm_any_lt(rsq02,rcutoff2))
454	{
455
456	r02 = _mm_mul_ps(rsq02,rinv02);
457
458	/* EWALD ELECTROSTATICS */
459
460	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
461	ewrt = _mm_mul_ps(r02,ewtabscale);
462	ewitab = _mm_cvttps_epi32(ewrt);
463	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
464	ewitab = _mm_slli_epi32(ewitab,2);
465	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
466	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
467	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
468	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
469	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
470	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
471	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
472	velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
473	felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
474
475	cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
476
477	/* Update potential sum for this i atom from the interaction with this j atom. */
478	velec = _mm_and_ps(velec,cutoff_mask);
479	velecsum = _mm_add_ps(velecsum,velec);
480
481	fscal = felec;
482
483	fscal = _mm_and_ps(fscal,cutoff_mask);
484
485	/* Calculate temporary vectorial force */
486	tx = _mm_mul_ps(fscal,dx02);
487	ty = _mm_mul_ps(fscal,dy02);
488	tz = _mm_mul_ps(fscal,dz02);
489
490	/* Update vectorial force */
491	fix0 = _mm_add_ps(fix0,tx);
492	fiy0 = _mm_add_ps(fiy0,ty);
493	fiz0 = _mm_add_ps(fiz0,tz);
494
495	fjx2 = _mm_add_ps(fjx2,tx);
496	fjy2 = _mm_add_ps(fjy2,ty);
497	fjz2 = _mm_add_ps(fjz2,tz);
498
499	}
500
501	/**************************
502	* CALCULATE INTERACTIONS *
503	**************************/
504
505	if (gmx_mm_any_lt(rsq10,rcutoff2))
506	{
507
508	r10 = _mm_mul_ps(rsq10,rinv10);
509
510	/* EWALD ELECTROSTATICS */
511
512	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
513	ewrt = _mm_mul_ps(r10,ewtabscale);
514	ewitab = _mm_cvttps_epi32(ewrt);
515	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
516	ewitab = _mm_slli_epi32(ewitab,2);
517	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
518	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
519	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
520	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
521	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
522	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
523	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
524	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
525	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
526
527	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
528
529	/* Update potential sum for this i atom from the interaction with this j atom. */
530	velec = _mm_and_ps(velec,cutoff_mask);
531	velecsum = _mm_add_ps(velecsum,velec);
532
533	fscal = felec;
534
535	fscal = _mm_and_ps(fscal,cutoff_mask);
536
537	/* Calculate temporary vectorial force */
538	tx = _mm_mul_ps(fscal,dx10);
539	ty = _mm_mul_ps(fscal,dy10);
540	tz = _mm_mul_ps(fscal,dz10);
541
542	/* Update vectorial force */
543	fix1 = _mm_add_ps(fix1,tx);
544	fiy1 = _mm_add_ps(fiy1,ty);
545	fiz1 = _mm_add_ps(fiz1,tz);
546
547	fjx0 = _mm_add_ps(fjx0,tx);
548	fjy0 = _mm_add_ps(fjy0,ty);
549	fjz0 = _mm_add_ps(fjz0,tz);
550
551	}
552
553	/**************************
554	* CALCULATE INTERACTIONS *
555	**************************/
556
557	if (gmx_mm_any_lt(rsq11,rcutoff2))
558	{
559
560	r11 = _mm_mul_ps(rsq11,rinv11);
561
562	/* EWALD ELECTROSTATICS */
563
564	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
565	ewrt = _mm_mul_ps(r11,ewtabscale);
566	ewitab = _mm_cvttps_epi32(ewrt);
567	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
568	ewitab = _mm_slli_epi32(ewitab,2);
569	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
570	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
571	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
572	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
573	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
574	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
575	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
576	velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
577	felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
578
579	cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
580
581	/* Update potential sum for this i atom from the interaction with this j atom. */
582	velec = _mm_and_ps(velec,cutoff_mask);
583	velecsum = _mm_add_ps(velecsum,velec);
584
585	fscal = felec;
586
587	fscal = _mm_and_ps(fscal,cutoff_mask);
588
589	/* Calculate temporary vectorial force */
590	tx = _mm_mul_ps(fscal,dx11);
591	ty = _mm_mul_ps(fscal,dy11);
592	tz = _mm_mul_ps(fscal,dz11);
593
594	/* Update vectorial force */
595	fix1 = _mm_add_ps(fix1,tx);
596	fiy1 = _mm_add_ps(fiy1,ty);
597	fiz1 = _mm_add_ps(fiz1,tz);
598
599	fjx1 = _mm_add_ps(fjx1,tx);
600	fjy1 = _mm_add_ps(fjy1,ty);
601	fjz1 = _mm_add_ps(fjz1,tz);
602
603	}
604
605	/**************************
606	* CALCULATE INTERACTIONS *
607	**************************/
608
609	if (gmx_mm_any_lt(rsq12,rcutoff2))
610	{
611
612	r12 = _mm_mul_ps(rsq12,rinv12);
613
614	/* EWALD ELECTROSTATICS */
615
616	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
617	ewrt = _mm_mul_ps(r12,ewtabscale);
618	ewitab = _mm_cvttps_epi32(ewrt);
619	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
620	ewitab = _mm_slli_epi32(ewitab,2);
621	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
622	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
623	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
624	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
625	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
626	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
627	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
628	velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
629	felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
630
631	cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
632
633	/* Update potential sum for this i atom from the interaction with this j atom. */
634	velec = _mm_and_ps(velec,cutoff_mask);
635	velecsum = _mm_add_ps(velecsum,velec);
636
637	fscal = felec;
638
639	fscal = _mm_and_ps(fscal,cutoff_mask);
640
641	/* Calculate temporary vectorial force */
642	tx = _mm_mul_ps(fscal,dx12);
643	ty = _mm_mul_ps(fscal,dy12);
644	tz = _mm_mul_ps(fscal,dz12);
645
646	/* Update vectorial force */
647	fix1 = _mm_add_ps(fix1,tx);
648	fiy1 = _mm_add_ps(fiy1,ty);
649	fiz1 = _mm_add_ps(fiz1,tz);
650
651	fjx2 = _mm_add_ps(fjx2,tx);
652	fjy2 = _mm_add_ps(fjy2,ty);
653	fjz2 = _mm_add_ps(fjz2,tz);
654
655	}
656
657	/**************************
658	* CALCULATE INTERACTIONS *
659	**************************/
660
661	if (gmx_mm_any_lt(rsq20,rcutoff2))
662	{
663
664	r20 = _mm_mul_ps(rsq20,rinv20);
665
666	/* EWALD ELECTROSTATICS */
667
668	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
669	ewrt = _mm_mul_ps(r20,ewtabscale);
670	ewitab = _mm_cvttps_epi32(ewrt);
671	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
672	ewitab = _mm_slli_epi32(ewitab,2);
673	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
674	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
675	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
676	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
677	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
678	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
679	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
680	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
681	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
682
683	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
684
685	/* Update potential sum for this i atom from the interaction with this j atom. */
686	velec = _mm_and_ps(velec,cutoff_mask);
687	velecsum = _mm_add_ps(velecsum,velec);
688
689	fscal = felec;
690
691	fscal = _mm_and_ps(fscal,cutoff_mask);
692
693	/* Calculate temporary vectorial force */
694	tx = _mm_mul_ps(fscal,dx20);
695	ty = _mm_mul_ps(fscal,dy20);
696	tz = _mm_mul_ps(fscal,dz20);
697
698	/* Update vectorial force */
699	fix2 = _mm_add_ps(fix2,tx);
700	fiy2 = _mm_add_ps(fiy2,ty);
701	fiz2 = _mm_add_ps(fiz2,tz);
702
703	fjx0 = _mm_add_ps(fjx0,tx);
704	fjy0 = _mm_add_ps(fjy0,ty);
705	fjz0 = _mm_add_ps(fjz0,tz);
706
707	}
708
709	/**************************
710	* CALCULATE INTERACTIONS *
711	**************************/
712
713	if (gmx_mm_any_lt(rsq21,rcutoff2))
714	{
715
716	r21 = _mm_mul_ps(rsq21,rinv21);
717
718	/* EWALD ELECTROSTATICS */
719
720	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
721	ewrt = _mm_mul_ps(r21,ewtabscale);
722	ewitab = _mm_cvttps_epi32(ewrt);
723	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
724	ewitab = _mm_slli_epi32(ewitab,2);
725	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
726	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
727	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
728	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
729	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
730	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
731	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
732	velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
733	felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
734
735	cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
736
737	/* Update potential sum for this i atom from the interaction with this j atom. */
738	velec = _mm_and_ps(velec,cutoff_mask);
739	velecsum = _mm_add_ps(velecsum,velec);
740
741	fscal = felec;
742
743	fscal = _mm_and_ps(fscal,cutoff_mask);
744
745	/* Calculate temporary vectorial force */
746	tx = _mm_mul_ps(fscal,dx21);
747	ty = _mm_mul_ps(fscal,dy21);
748	tz = _mm_mul_ps(fscal,dz21);
749
750	/* Update vectorial force */
751	fix2 = _mm_add_ps(fix2,tx);
752	fiy2 = _mm_add_ps(fiy2,ty);
753	fiz2 = _mm_add_ps(fiz2,tz);
754
755	fjx1 = _mm_add_ps(fjx1,tx);
756	fjy1 = _mm_add_ps(fjy1,ty);
757	fjz1 = _mm_add_ps(fjz1,tz);
758
759	}
760
761	/**************************
762	* CALCULATE INTERACTIONS *
763	**************************/
764
765	if (gmx_mm_any_lt(rsq22,rcutoff2))
766	{
767
768	r22 = _mm_mul_ps(rsq22,rinv22);
769
770	/* EWALD ELECTROSTATICS */
771
772	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
773	ewrt = _mm_mul_ps(r22,ewtabscale);
774	ewitab = _mm_cvttps_epi32(ewrt);
775	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
776	ewitab = _mm_slli_epi32(ewitab,2);
777	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
778	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
779	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
780	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
781	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
782	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
783	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
784	velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
785	felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
786
787	cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
788
789	/* Update potential sum for this i atom from the interaction with this j atom. */
790	velec = _mm_and_ps(velec,cutoff_mask);
791	velecsum = _mm_add_ps(velecsum,velec);
792
793	fscal = felec;
794
795	fscal = _mm_and_ps(fscal,cutoff_mask);
796
797	/* Calculate temporary vectorial force */
798	tx = _mm_mul_ps(fscal,dx22);
799	ty = _mm_mul_ps(fscal,dy22);
800	tz = _mm_mul_ps(fscal,dz22);
801
802	/* Update vectorial force */
803	fix2 = _mm_add_ps(fix2,tx);
804	fiy2 = _mm_add_ps(fiy2,ty);
805	fiz2 = _mm_add_ps(fiz2,tz);
806
807	fjx2 = _mm_add_ps(fjx2,tx);
808	fjy2 = _mm_add_ps(fjy2,ty);
809	fjz2 = _mm_add_ps(fjz2,tz);
810
811	}
812
813	fjptrA = f+j_coord_offsetA;
814	fjptrB = f+j_coord_offsetB;
815	fjptrC = f+j_coord_offsetC;
816	fjptrD = f+j_coord_offsetD;
817
818	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
819	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
820
821	/* Inner loop uses 450 flops */
822	}
823
824	if(jidx<j_index_end)
825	{
826
827	/* Get j neighbor index, and coordinate index */
828	jnrlistA = jjnr[jidx];
829	jnrlistB = jjnr[jidx+1];
830	jnrlistC = jjnr[jidx+2];
831	jnrlistD = jjnr[jidx+3];
832	/* Sign of each element will be negative for non-real atoms.
833	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
834	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
835	*/
836	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
837	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
838	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
839	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
840	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
841	j_coord_offsetA = DIM3*jnrA;
842	j_coord_offsetB = DIM3*jnrB;
843	j_coord_offsetC = DIM3*jnrC;
844	j_coord_offsetD = DIM3*jnrD;
845
846	/* load j atom coordinates */
847	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
848	x+j_coord_offsetC,x+j_coord_offsetD,
849	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
850
851	/* Calculate displacement vector */
852	dx00 = _mm_sub_ps(ix0,jx0);
853	dy00 = _mm_sub_ps(iy0,jy0);
854	dz00 = _mm_sub_ps(iz0,jz0);
855	dx01 = _mm_sub_ps(ix0,jx1);
856	dy01 = _mm_sub_ps(iy0,jy1);
857	dz01 = _mm_sub_ps(iz0,jz1);
858	dx02 = _mm_sub_ps(ix0,jx2);
859	dy02 = _mm_sub_ps(iy0,jy2);
860	dz02 = _mm_sub_ps(iz0,jz2);
861	dx10 = _mm_sub_ps(ix1,jx0);
862	dy10 = _mm_sub_ps(iy1,jy0);
863	dz10 = _mm_sub_ps(iz1,jz0);
864	dx11 = _mm_sub_ps(ix1,jx1);
865	dy11 = _mm_sub_ps(iy1,jy1);
866	dz11 = _mm_sub_ps(iz1,jz1);
867	dx12 = _mm_sub_ps(ix1,jx2);
868	dy12 = _mm_sub_ps(iy1,jy2);
869	dz12 = _mm_sub_ps(iz1,jz2);
870	dx20 = _mm_sub_ps(ix2,jx0);
871	dy20 = _mm_sub_ps(iy2,jy0);
872	dz20 = _mm_sub_ps(iz2,jz0);
873	dx21 = _mm_sub_ps(ix2,jx1);
874	dy21 = _mm_sub_ps(iy2,jy1);
875	dz21 = _mm_sub_ps(iz2,jz1);
876	dx22 = _mm_sub_ps(ix2,jx2);
877	dy22 = _mm_sub_ps(iy2,jy2);
878	dz22 = _mm_sub_ps(iz2,jz2);
879
880	/* Calculate squared distance and things based on it */
881	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
882	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
883	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
884	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
885	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
886	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
887	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
888	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
889	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
890
891	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
892	rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
893	rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
894	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
895	rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
896	rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
897	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
898	rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
899	rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
900
901	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
902	rinvsq01 = _mm_mul_ps(rinv01,rinv01);
903	rinvsq02 = _mm_mul_ps(rinv02,rinv02);
904	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
905	rinvsq11 = _mm_mul_ps(rinv11,rinv11);
906	rinvsq12 = _mm_mul_ps(rinv12,rinv12);
907	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
908	rinvsq21 = _mm_mul_ps(rinv21,rinv21);
909	rinvsq22 = _mm_mul_ps(rinv22,rinv22);
910
911	fjx0 = _mm_setzero_ps();
912	fjy0 = _mm_setzero_ps();
913	fjz0 = _mm_setzero_ps();
914	fjx1 = _mm_setzero_ps();
915	fjy1 = _mm_setzero_ps();
916	fjz1 = _mm_setzero_ps();
917	fjx2 = _mm_setzero_ps();
918	fjy2 = _mm_setzero_ps();
919	fjz2 = _mm_setzero_ps();
920
921	/**************************
922	* CALCULATE INTERACTIONS *
923	**************************/
924
925	if (gmx_mm_any_lt(rsq00,rcutoff2))
926	{
927
928	r00 = _mm_mul_ps(rsq00,rinv00);
929	r00 = _mm_andnot_ps(dummy_mask,r00);
930
931	/* EWALD ELECTROSTATICS */
932
933	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
934	ewrt = _mm_mul_ps(r00,ewtabscale);
935	ewitab = _mm_cvttps_epi32(ewrt);
936	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
937	ewitab = _mm_slli_epi32(ewitab,2);
938	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
939	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
940	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
941	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
942	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
943	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
944	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
945	velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
946	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
947
948	/* Analytical LJ-PME */
949	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
950	ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
951	ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
952	exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
953	/* poly = exp(-(betar)^2) (1 + (betar)^2 + (betar)^4 /2) */
954	poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
955	/* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
956	vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
957	vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
958	vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
959	_mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
960	/* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
961	fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
962
963	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
964
965	/* Update potential sum for this i atom from the interaction with this j atom. */
966	velec = _mm_and_ps(velec,cutoff_mask);
967	velec = _mm_andnot_ps(dummy_mask,velec);
968	velecsum = _mm_add_ps(velecsum,velec);
969	vvdw = _mm_and_ps(vvdw,cutoff_mask);
970	vvdw = _mm_andnot_ps(dummy_mask,vvdw);
971	vvdwsum = _mm_add_ps(vvdwsum,vvdw);
972
973	fscal = _mm_add_ps(felec,fvdw);
974
975	fscal = _mm_and_ps(fscal,cutoff_mask);
976
977	fscal = _mm_andnot_ps(dummy_mask,fscal);
978
979	/* Calculate temporary vectorial force */
980	tx = _mm_mul_ps(fscal,dx00);
981	ty = _mm_mul_ps(fscal,dy00);
982	tz = _mm_mul_ps(fscal,dz00);
983
984	/* Update vectorial force */
985	fix0 = _mm_add_ps(fix0,tx);
986	fiy0 = _mm_add_ps(fiy0,ty);
987	fiz0 = _mm_add_ps(fiz0,tz);
988
989	fjx0 = _mm_add_ps(fjx0,tx);
990	fjy0 = _mm_add_ps(fjy0,ty);
991	fjz0 = _mm_add_ps(fjz0,tz);
992
993	}
994
995	/**************************
996	* CALCULATE INTERACTIONS *
997	**************************/
998
999	if (gmx_mm_any_lt(rsq01,rcutoff2))
1000	{
1001
1002	r01 = _mm_mul_ps(rsq01,rinv01);
1003	r01 = _mm_andnot_ps(dummy_mask,r01);
1004
1005	/* EWALD ELECTROSTATICS */
1006
1007	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1008	ewrt = _mm_mul_ps(r01,ewtabscale);
1009	ewitab = _mm_cvttps_epi32(ewrt);
1010	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1011	ewitab = _mm_slli_epi32(ewitab,2);
1012	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1013	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1014	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1015	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1016	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1017	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1018	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1019	velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
1020	felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1021
1022	cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1023
1024	/* Update potential sum for this i atom from the interaction with this j atom. */
1025	velec = _mm_and_ps(velec,cutoff_mask);
1026	velec = _mm_andnot_ps(dummy_mask,velec);
1027	velecsum = _mm_add_ps(velecsum,velec);
1028
1029	fscal = felec;
1030
1031	fscal = _mm_and_ps(fscal,cutoff_mask);
1032
1033	fscal = _mm_andnot_ps(dummy_mask,fscal);
1034
1035	/* Calculate temporary vectorial force */
1036	tx = _mm_mul_ps(fscal,dx01);
1037	ty = _mm_mul_ps(fscal,dy01);
1038	tz = _mm_mul_ps(fscal,dz01);
1039
1040	/* Update vectorial force */
1041	fix0 = _mm_add_ps(fix0,tx);
1042	fiy0 = _mm_add_ps(fiy0,ty);
1043	fiz0 = _mm_add_ps(fiz0,tz);
1044
1045	fjx1 = _mm_add_ps(fjx1,tx);
1046	fjy1 = _mm_add_ps(fjy1,ty);
1047	fjz1 = _mm_add_ps(fjz1,tz);
1048
1049	}
1050
1051	/**************************
1052	* CALCULATE INTERACTIONS *
1053	**************************/
1054
1055	if (gmx_mm_any_lt(rsq02,rcutoff2))
1056	{
1057
1058	r02 = _mm_mul_ps(rsq02,rinv02);
1059	r02 = _mm_andnot_ps(dummy_mask,r02);
1060
1061	/* EWALD ELECTROSTATICS */
1062
1063	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1064	ewrt = _mm_mul_ps(r02,ewtabscale);
1065	ewitab = _mm_cvttps_epi32(ewrt);
1066	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1067	ewitab = _mm_slli_epi32(ewitab,2);
1068	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1069	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1070	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1071	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1072	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1073	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1074	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1075	velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
1076	felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1077
1078	cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1079
1080	/* Update potential sum for this i atom from the interaction with this j atom. */
1081	velec = _mm_and_ps(velec,cutoff_mask);
1082	velec = _mm_andnot_ps(dummy_mask,velec);
1083	velecsum = _mm_add_ps(velecsum,velec);
1084
1085	fscal = felec;
1086
1087	fscal = _mm_and_ps(fscal,cutoff_mask);
1088
1089	fscal = _mm_andnot_ps(dummy_mask,fscal);
1090
1091	/* Calculate temporary vectorial force */
1092	tx = _mm_mul_ps(fscal,dx02);
1093	ty = _mm_mul_ps(fscal,dy02);
1094	tz = _mm_mul_ps(fscal,dz02);
1095
1096	/* Update vectorial force */
1097	fix0 = _mm_add_ps(fix0,tx);
1098	fiy0 = _mm_add_ps(fiy0,ty);
1099	fiz0 = _mm_add_ps(fiz0,tz);
1100
1101	fjx2 = _mm_add_ps(fjx2,tx);
1102	fjy2 = _mm_add_ps(fjy2,ty);
1103	fjz2 = _mm_add_ps(fjz2,tz);
1104
1105	}
1106
1107	/**************************
1108	* CALCULATE INTERACTIONS *
1109	**************************/
1110
1111	if (gmx_mm_any_lt(rsq10,rcutoff2))
1112	{
1113
1114	r10 = _mm_mul_ps(rsq10,rinv10);
1115	r10 = _mm_andnot_ps(dummy_mask,r10);
1116
1117	/* EWALD ELECTROSTATICS */
1118
1119	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1120	ewrt = _mm_mul_ps(r10,ewtabscale);
1121	ewitab = _mm_cvttps_epi32(ewrt);
1122	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1123	ewitab = _mm_slli_epi32(ewitab,2);
1124	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1125	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1126	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1127	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1128	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1129	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1130	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1131	velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1132	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1133
1134	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1135
1136	/* Update potential sum for this i atom from the interaction with this j atom. */
1137	velec = _mm_and_ps(velec,cutoff_mask);
1138	velec = _mm_andnot_ps(dummy_mask,velec);
1139	velecsum = _mm_add_ps(velecsum,velec);
1140
1141	fscal = felec;
1142
1143	fscal = _mm_and_ps(fscal,cutoff_mask);
1144
1145	fscal = _mm_andnot_ps(dummy_mask,fscal);
1146
1147	/* Calculate temporary vectorial force */
1148	tx = _mm_mul_ps(fscal,dx10);
1149	ty = _mm_mul_ps(fscal,dy10);
1150	tz = _mm_mul_ps(fscal,dz10);
1151
1152	/* Update vectorial force */
1153	fix1 = _mm_add_ps(fix1,tx);
1154	fiy1 = _mm_add_ps(fiy1,ty);
1155	fiz1 = _mm_add_ps(fiz1,tz);
1156
1157	fjx0 = _mm_add_ps(fjx0,tx);
1158	fjy0 = _mm_add_ps(fjy0,ty);
1159	fjz0 = _mm_add_ps(fjz0,tz);
1160
1161	}
1162
1163	/**************************
1164	* CALCULATE INTERACTIONS *
1165	**************************/
1166
1167	if (gmx_mm_any_lt(rsq11,rcutoff2))
1168	{
1169
1170	r11 = _mm_mul_ps(rsq11,rinv11);
1171	r11 = _mm_andnot_ps(dummy_mask,r11);
1172
1173	/* EWALD ELECTROSTATICS */
1174
1175	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1176	ewrt = _mm_mul_ps(r11,ewtabscale);
1177	ewitab = _mm_cvttps_epi32(ewrt);
1178	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1179	ewitab = _mm_slli_epi32(ewitab,2);
1180	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1181	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1182	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1183	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1184	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1185	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1186	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1187	velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1188	felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1189
1190	cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1191
1192	/* Update potential sum for this i atom from the interaction with this j atom. */
1193	velec = _mm_and_ps(velec,cutoff_mask);
1194	velec = _mm_andnot_ps(dummy_mask,velec);
1195	velecsum = _mm_add_ps(velecsum,velec);
1196
1197	fscal = felec;
1198
1199	fscal = _mm_and_ps(fscal,cutoff_mask);
1200
1201	fscal = _mm_andnot_ps(dummy_mask,fscal);
1202
1203	/* Calculate temporary vectorial force */
1204	tx = _mm_mul_ps(fscal,dx11);
1205	ty = _mm_mul_ps(fscal,dy11);
1206	tz = _mm_mul_ps(fscal,dz11);
1207
1208	/* Update vectorial force */
1209	fix1 = _mm_add_ps(fix1,tx);
1210	fiy1 = _mm_add_ps(fiy1,ty);
1211	fiz1 = _mm_add_ps(fiz1,tz);
1212
1213	fjx1 = _mm_add_ps(fjx1,tx);
1214	fjy1 = _mm_add_ps(fjy1,ty);
1215	fjz1 = _mm_add_ps(fjz1,tz);
1216
1217	}
1218
1219	/**************************
1220	* CALCULATE INTERACTIONS *
1221	**************************/
1222
1223	if (gmx_mm_any_lt(rsq12,rcutoff2))
1224	{
1225
1226	r12 = _mm_mul_ps(rsq12,rinv12);
1227	r12 = _mm_andnot_ps(dummy_mask,r12);
1228
1229	/* EWALD ELECTROSTATICS */
1230
1231	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1232	ewrt = _mm_mul_ps(r12,ewtabscale);
1233	ewitab = _mm_cvttps_epi32(ewrt);
1234	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1235	ewitab = _mm_slli_epi32(ewitab,2);
1236	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1237	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1238	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1239	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1240	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1241	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1242	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1243	velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1244	felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1245
1246	cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1247
1248	/* Update potential sum for this i atom from the interaction with this j atom. */
1249	velec = _mm_and_ps(velec,cutoff_mask);
1250	velec = _mm_andnot_ps(dummy_mask,velec);
1251	velecsum = _mm_add_ps(velecsum,velec);
1252
1253	fscal = felec;
1254
1255	fscal = _mm_and_ps(fscal,cutoff_mask);
1256
1257	fscal = _mm_andnot_ps(dummy_mask,fscal);
1258
1259	/* Calculate temporary vectorial force */
1260	tx = _mm_mul_ps(fscal,dx12);
1261	ty = _mm_mul_ps(fscal,dy12);
1262	tz = _mm_mul_ps(fscal,dz12);
1263
1264	/* Update vectorial force */
1265	fix1 = _mm_add_ps(fix1,tx);
1266	fiy1 = _mm_add_ps(fiy1,ty);
1267	fiz1 = _mm_add_ps(fiz1,tz);
1268
1269	fjx2 = _mm_add_ps(fjx2,tx);
1270	fjy2 = _mm_add_ps(fjy2,ty);
1271	fjz2 = _mm_add_ps(fjz2,tz);
1272
1273	}
1274
1275	/**************************
1276	* CALCULATE INTERACTIONS *
1277	**************************/
1278
1279	if (gmx_mm_any_lt(rsq20,rcutoff2))
1280	{
1281
1282	r20 = _mm_mul_ps(rsq20,rinv20);
1283	r20 = _mm_andnot_ps(dummy_mask,r20);
1284
1285	/* EWALD ELECTROSTATICS */
1286
1287	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1288	ewrt = _mm_mul_ps(r20,ewtabscale);
1289	ewitab = _mm_cvttps_epi32(ewrt);
1290	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1291	ewitab = _mm_slli_epi32(ewitab,2);
1292	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1293	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1294	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1295	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1296	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1297	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1298	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1299	velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1300	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1301
1302	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1303
1304	/* Update potential sum for this i atom from the interaction with this j atom. */
1305	velec = _mm_and_ps(velec,cutoff_mask);
1306	velec = _mm_andnot_ps(dummy_mask,velec);
1307	velecsum = _mm_add_ps(velecsum,velec);
1308
1309	fscal = felec;
1310
1311	fscal = _mm_and_ps(fscal,cutoff_mask);
1312
1313	fscal = _mm_andnot_ps(dummy_mask,fscal);
1314
1315	/* Calculate temporary vectorial force */
1316	tx = _mm_mul_ps(fscal,dx20);
1317	ty = _mm_mul_ps(fscal,dy20);
1318	tz = _mm_mul_ps(fscal,dz20);
1319
1320	/* Update vectorial force */
1321	fix2 = _mm_add_ps(fix2,tx);
1322	fiy2 = _mm_add_ps(fiy2,ty);
1323	fiz2 = _mm_add_ps(fiz2,tz);
1324
1325	fjx0 = _mm_add_ps(fjx0,tx);
1326	fjy0 = _mm_add_ps(fjy0,ty);
1327	fjz0 = _mm_add_ps(fjz0,tz);
1328
1329	}
1330
1331	/**************************
1332	* CALCULATE INTERACTIONS *
1333	**************************/
1334
1335	if (gmx_mm_any_lt(rsq21,rcutoff2))
1336	{
1337
1338	r21 = _mm_mul_ps(rsq21,rinv21);
1339	r21 = _mm_andnot_ps(dummy_mask,r21);
1340
1341	/* EWALD ELECTROSTATICS */
1342
1343	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1344	ewrt = _mm_mul_ps(r21,ewtabscale);
1345	ewitab = _mm_cvttps_epi32(ewrt);
1346	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1347	ewitab = _mm_slli_epi32(ewitab,2);
1348	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1349	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1350	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1351	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1352	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1353	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1354	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1355	velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1356	felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1357
1358	cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1359
1360	/* Update potential sum for this i atom from the interaction with this j atom. */
1361	velec = _mm_and_ps(velec,cutoff_mask);
1362	velec = _mm_andnot_ps(dummy_mask,velec);
1363	velecsum = _mm_add_ps(velecsum,velec);
1364
1365	fscal = felec;
1366
1367	fscal = _mm_and_ps(fscal,cutoff_mask);
1368
1369	fscal = _mm_andnot_ps(dummy_mask,fscal);
1370
1371	/* Calculate temporary vectorial force */
1372	tx = _mm_mul_ps(fscal,dx21);
1373	ty = _mm_mul_ps(fscal,dy21);
1374	tz = _mm_mul_ps(fscal,dz21);
1375
1376	/* Update vectorial force */
1377	fix2 = _mm_add_ps(fix2,tx);
1378	fiy2 = _mm_add_ps(fiy2,ty);
1379	fiz2 = _mm_add_ps(fiz2,tz);
1380
1381	fjx1 = _mm_add_ps(fjx1,tx);
1382	fjy1 = _mm_add_ps(fjy1,ty);
1383	fjz1 = _mm_add_ps(fjz1,tz);
1384
1385	}
1386
1387	/**************************
1388	* CALCULATE INTERACTIONS *
1389	**************************/
1390
1391	if (gmx_mm_any_lt(rsq22,rcutoff2))
1392	{
1393
1394	r22 = _mm_mul_ps(rsq22,rinv22);
1395	r22 = _mm_andnot_ps(dummy_mask,r22);
1396
1397	/* EWALD ELECTROSTATICS */
1398
1399	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1400	ewrt = _mm_mul_ps(r22,ewtabscale);
1401	ewitab = _mm_cvttps_epi32(ewrt);
1402	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1403	ewitab = _mm_slli_epi32(ewitab,2);
1404	ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})) );
1405	ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})) );
1406	ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})) );
1407	ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})) );
1408	_MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn)do { __m128 tmp3, tmp2, tmp1, tmp0; tmp0 = _mm_unpacklo_ps((ewtabF ), (ewtabD)); tmp2 = _mm_unpacklo_ps((ewtabV), (ewtabFn)); tmp1 = _mm_unpackhi_ps((ewtabF), (ewtabD)); tmp3 = _mm_unpackhi_ps ((ewtabV), (ewtabFn)); (ewtabF) = _mm_movelh_ps(tmp0, tmp2); ( ewtabD) = _mm_movehl_ps(tmp2, tmp0); (ewtabV) = _mm_movelh_ps (tmp1, tmp3); (ewtabFn) = _mm_movehl_ps(tmp3, tmp1); } while ( 0);
1409	felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1410	velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1411	velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1412	felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1413
1414	cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1415
1416	/* Update potential sum for this i atom from the interaction with this j atom. */
1417	velec = _mm_and_ps(velec,cutoff_mask);
1418	velec = _mm_andnot_ps(dummy_mask,velec);
1419	velecsum = _mm_add_ps(velecsum,velec);
1420
1421	fscal = felec;
1422
1423	fscal = _mm_and_ps(fscal,cutoff_mask);
1424
1425	fscal = _mm_andnot_ps(dummy_mask,fscal);
1426
1427	/* Calculate temporary vectorial force */
1428	tx = _mm_mul_ps(fscal,dx22);
1429	ty = _mm_mul_ps(fscal,dy22);
1430	tz = _mm_mul_ps(fscal,dz22);
1431
1432	/* Update vectorial force */
1433	fix2 = _mm_add_ps(fix2,tx);
1434	fiy2 = _mm_add_ps(fiy2,ty);
1435	fiz2 = _mm_add_ps(fiz2,tz);
1436
1437	fjx2 = _mm_add_ps(fjx2,tx);
1438	fjy2 = _mm_add_ps(fjy2,ty);
1439	fjz2 = _mm_add_ps(fjz2,tz);
1440
1441	}
1442
1443	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1444	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1445	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1446	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1447
1448	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1449	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1450
1451	/* Inner loop uses 459 flops */
1452	}
1453
1454	/* End of innermost loop */
1455
1456	gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1457	f+i_coord_offset,fshift+i_shift_offset);
1458
1459	ggid = gid[iidx];
1460	/* Update potential energies */
1461	gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1462	gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1463
1464	/* Increment number of inner iterations */
1465	inneriter += j_index_end - j_index_start;
1466
1467	/* Outer loop uses 20 flops */
1468	}
1469
1470	/* Increment number of outer iterations */
1471	outeriter += nri;
1472
1473	/* Update outer/inner flops */
1474
1475	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter20 + inneriter459)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_VF] += outeriter20 + inneriter459;
1476	}
1477	/*
1478	* Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_single
1479	* Electrostatics interaction: Ewald
1480	* VdW interaction: LJEwald
1481	* Geometry: Water3-Water3
1482	* Calculate force/pot: Force
1483	*/
1484	void
1485	nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse4_1_single
1486	(t_nblist * gmx_restrict nlist,
1487	rvec * gmx_restrict xx,
1488	rvec * gmx_restrict ff,
1489	t_forcerec * gmx_restrict fr,
1490	t_mdatoms * gmx_restrict mdatoms,
1491	nb_kernel_data_t gmx_unused__attribute__ ((unused)) * gmx_restrict kernel_data,
1492	t_nrnb * gmx_restrict nrnb)
1493	{
1494	/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1495	* just 0 for non-waters.
1496	* Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1497	* jnr indices corresponding to data put in the four positions in the SIMD register.
1498	*/
1499	int i_shift_offset,i_coord_offset,outeriter,inneriter;
1500	int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1501	int jnrA,jnrB,jnrC,jnrD;
1502	int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1503	int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1504	int iinr,jindex,jjnr,shiftidx,*gid;
1505	real rcutoff_scalar;
1506	real shiftvec,fshift,x,f;
1507	real fjptrA,fjptrB,fjptrC,fjptrD;
1508	real scratch[4*DIM3];
1509	__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1510	int vdwioffset0;
1511	__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1512	int vdwioffset1;
1513	__m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1514	int vdwioffset2;
1515	__m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1516	int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1517	__m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1518	int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1519	__m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1520	int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1521	__m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1522	__m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1523	__m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1524	__m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1525	__m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1526	__m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1527	__m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1528	__m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1529	__m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1530	__m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1531	__m128 velec,felec,velecsum,facel,crf,krf,krf2;
1532	real *charge;
1533	int nvdwtype;
1534	__m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1535	int *vdwtype;
1536	real *vdwparam;
1537	__m128 one_sixth = _mm_set1_ps(1.0/6.0);
1538	__m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1539	__m128 c6grid_00;
1540	__m128 c6grid_01;
1541	__m128 c6grid_02;
1542	__m128 c6grid_10;
1543	__m128 c6grid_11;
1544	__m128 c6grid_12;
1545	__m128 c6grid_20;
1546	__m128 c6grid_21;
1547	__m128 c6grid_22;
1548	__m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1549	real *vdwgridparam;
1550	__m128 one_half = _mm_set1_ps(0.5);
1551	__m128 minus_one = _mm_set1_ps(-1.0);
1552	__m128i ewitab;
1553	__m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1554	real *ewtab;
1555	__m128 dummy_mask,cutoff_mask;
1556	__m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1557	__m128 one = _mm_set1_ps(1.0);
1558	__m128 two = _mm_set1_ps(2.0);
1559	x = xx[0];
1560	f = ff[0];
1561
1562	nri = nlist->nri;
1563	iinr = nlist->iinr;
1564	jindex = nlist->jindex;
1565	jjnr = nlist->jjnr;
1566	shiftidx = nlist->shift;
1567	gid = nlist->gid;
1568	shiftvec = fr->shift_vec[0];
1569	fshift = fr->fshift[0];
1570	facel = _mm_set1_ps(fr->epsfac);
1571	charge = mdatoms->chargeA;
1572	nvdwtype = fr->ntype;
1573	vdwparam = fr->nbfp;
1574	vdwtype = mdatoms->typeA;
1575	vdwgridparam = fr->ljpme_c6grid;
1576	sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
1577	ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
1578	ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1579
1580	sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1581	ewtab = fr->ic->tabq_coul_F;
1582	ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1583	ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1584
1585	/* Setup water-specific parameters */
1586	inr = nlist->iinr[0];
1587	iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1588	iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1589	iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1590	vdwioffset0 = 2nvdwtypevdwtype[inr+0];
1591
1592	jq0 = _mm_set1_ps(charge[inr+0]);
1593	jq1 = _mm_set1_ps(charge[inr+1]);
1594	jq2 = _mm_set1_ps(charge[inr+2]);
1595	vdwjidx0A = 2*vdwtype[inr+0];
1596	qq00 = _mm_mul_ps(iq0,jq0);
1597	c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1598	c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1599	c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1600	qq01 = _mm_mul_ps(iq0,jq1);
1601	qq02 = _mm_mul_ps(iq0,jq2);
1602	qq10 = _mm_mul_ps(iq1,jq0);
1603	qq11 = _mm_mul_ps(iq1,jq1);
1604	qq12 = _mm_mul_ps(iq1,jq2);
1605	qq20 = _mm_mul_ps(iq2,jq0);
1606	qq21 = _mm_mul_ps(iq2,jq1);
1607	qq22 = _mm_mul_ps(iq2,jq2);
1608
1609	/* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1610	rcutoff_scalar = fr->rcoulomb;
1611	rcutoff = _mm_set1_ps(rcutoff_scalar);
1612	rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1613
1614	sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1615	rvdw = _mm_set1_ps(fr->rvdw);
1616
1617	/* Avoid stupid compiler warnings */
1618	jnrA = jnrB = jnrC = jnrD = 0;
1619	j_coord_offsetA = 0;
1620	j_coord_offsetB = 0;
1621	j_coord_offsetC = 0;
1622	j_coord_offsetD = 0;
1623
1624	outeriter = 0;
1625	inneriter = 0;
1626
1627	for(iidx=0;iidx<4*DIM3;iidx++)
1628	{
1629	scratch[iidx] = 0.0;
1630	}
1631
1632	/* Start outer loop over neighborlists */
1633	for(iidx=0; iidx<nri; iidx++)
1634	{
1635	/* Load shift vector for this list */
1636	i_shift_offset = DIM3*shiftidx[iidx];
1637
1638	/* Load limits for loop over neighbors */
1639	j_index_start = jindex[iidx];
1640	j_index_end = jindex[iidx+1];
1641
1642	/* Get outer coordinate index */
1643	inr = iinr[iidx];
1644	i_coord_offset = DIM3*inr;
1645
1646	/* Load i particle coords and add shift vector */
1647	gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1648	&ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1649
1650	fix0 = _mm_setzero_ps();
1651	fiy0 = _mm_setzero_ps();
1652	fiz0 = _mm_setzero_ps();
1653	fix1 = _mm_setzero_ps();
1654	fiy1 = _mm_setzero_ps();
1655	fiz1 = _mm_setzero_ps();
1656	fix2 = _mm_setzero_ps();
1657	fiy2 = _mm_setzero_ps();
1658	fiz2 = _mm_setzero_ps();
1659
1660	/* Start inner kernel loop */
1661	for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1662	{
1663
1664	/* Get j neighbor index, and coordinate index */
1665	jnrA = jjnr[jidx];
1666	jnrB = jjnr[jidx+1];
1667	jnrC = jjnr[jidx+2];
1668	jnrD = jjnr[jidx+3];
1669	j_coord_offsetA = DIM3*jnrA;
1670	j_coord_offsetB = DIM3*jnrB;
1671	j_coord_offsetC = DIM3*jnrC;
1672	j_coord_offsetD = DIM3*jnrD;
1673
1674	/* load j atom coordinates */
1675	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1676	x+j_coord_offsetC,x+j_coord_offsetD,
1677	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1678
1679	/* Calculate displacement vector */
1680	dx00 = _mm_sub_ps(ix0,jx0);
1681	dy00 = _mm_sub_ps(iy0,jy0);
1682	dz00 = _mm_sub_ps(iz0,jz0);
1683	dx01 = _mm_sub_ps(ix0,jx1);
1684	dy01 = _mm_sub_ps(iy0,jy1);
1685	dz01 = _mm_sub_ps(iz0,jz1);
1686	dx02 = _mm_sub_ps(ix0,jx2);
1687	dy02 = _mm_sub_ps(iy0,jy2);
1688	dz02 = _mm_sub_ps(iz0,jz2);
1689	dx10 = _mm_sub_ps(ix1,jx0);
1690	dy10 = _mm_sub_ps(iy1,jy0);
1691	dz10 = _mm_sub_ps(iz1,jz0);
1692	dx11 = _mm_sub_ps(ix1,jx1);
1693	dy11 = _mm_sub_ps(iy1,jy1);
1694	dz11 = _mm_sub_ps(iz1,jz1);
1695	dx12 = _mm_sub_ps(ix1,jx2);
1696	dy12 = _mm_sub_ps(iy1,jy2);
1697	dz12 = _mm_sub_ps(iz1,jz2);
1698	dx20 = _mm_sub_ps(ix2,jx0);
1699	dy20 = _mm_sub_ps(iy2,jy0);
1700	dz20 = _mm_sub_ps(iz2,jz0);
1701	dx21 = _mm_sub_ps(ix2,jx1);
1702	dy21 = _mm_sub_ps(iy2,jy1);
1703	dz21 = _mm_sub_ps(iz2,jz1);
1704	dx22 = _mm_sub_ps(ix2,jx2);
1705	dy22 = _mm_sub_ps(iy2,jy2);
1706	dz22 = _mm_sub_ps(iz2,jz2);
1707
1708	/* Calculate squared distance and things based on it */
1709	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1710	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1711	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1712	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1713	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1714	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1715	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1716	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1717	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1718
1719	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
1720	rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
1721	rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
1722	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
1723	rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
1724	rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
1725	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
1726	rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
1727	rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
1728
1729	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1730	rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1731	rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1732	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1733	rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1734	rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1735	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1736	rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1737	rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1738
1739	fjx0 = _mm_setzero_ps();
1740	fjy0 = _mm_setzero_ps();
1741	fjz0 = _mm_setzero_ps();
1742	fjx1 = _mm_setzero_ps();
1743	fjy1 = _mm_setzero_ps();
1744	fjz1 = _mm_setzero_ps();
1745	fjx2 = _mm_setzero_ps();
1746	fjy2 = _mm_setzero_ps();
1747	fjz2 = _mm_setzero_ps();
1748
1749	/**************************
1750	* CALCULATE INTERACTIONS *
1751	**************************/
1752
1753	if (gmx_mm_any_lt(rsq00,rcutoff2))
1754	{
1755
1756	r00 = _mm_mul_ps(rsq00,rinv00);
1757
1758	/* EWALD ELECTROSTATICS */
1759
1760	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1761	ewrt = _mm_mul_ps(r00,ewtabscale);
1762	ewitab = _mm_cvttps_epi32(ewrt);
1763	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1764	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1765	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1766	&ewtabF,&ewtabFn);
1767	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1768	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1769
1770	/* Analytical LJ-PME */
1771	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1772	ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1773	ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1774	exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
1775	/* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1776	poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1777	/* f6A = 6 * C6grid * (1 - poly) */
1778	f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1779	/* f6B = C6grid * exponent * beta^6 */
1780	f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1781	/* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1782	fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1783
1784	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1785
1786	fscal = _mm_add_ps(felec,fvdw);
1787
1788	fscal = _mm_and_ps(fscal,cutoff_mask);
1789
1790	/* Calculate temporary vectorial force */
1791	tx = _mm_mul_ps(fscal,dx00);
1792	ty = _mm_mul_ps(fscal,dy00);
1793	tz = _mm_mul_ps(fscal,dz00);
1794
1795	/* Update vectorial force */
1796	fix0 = _mm_add_ps(fix0,tx);
1797	fiy0 = _mm_add_ps(fiy0,ty);
1798	fiz0 = _mm_add_ps(fiz0,tz);
1799
1800	fjx0 = _mm_add_ps(fjx0,tx);
1801	fjy0 = _mm_add_ps(fjy0,ty);
1802	fjz0 = _mm_add_ps(fjz0,tz);
1803
1804	}
1805
1806	/**************************
1807	* CALCULATE INTERACTIONS *
1808	**************************/
1809
1810	if (gmx_mm_any_lt(rsq01,rcutoff2))
1811	{
1812
1813	r01 = _mm_mul_ps(rsq01,rinv01);
1814
1815	/* EWALD ELECTROSTATICS */
1816
1817	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1818	ewrt = _mm_mul_ps(r01,ewtabscale);
1819	ewitab = _mm_cvttps_epi32(ewrt);
1820	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1821	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1822	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1823	&ewtabF,&ewtabFn);
1824	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1825	felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1826
1827	cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1828
1829	fscal = felec;
1830
1831	fscal = _mm_and_ps(fscal,cutoff_mask);
1832
1833	/* Calculate temporary vectorial force */
1834	tx = _mm_mul_ps(fscal,dx01);
1835	ty = _mm_mul_ps(fscal,dy01);
1836	tz = _mm_mul_ps(fscal,dz01);
1837
1838	/* Update vectorial force */
1839	fix0 = _mm_add_ps(fix0,tx);
1840	fiy0 = _mm_add_ps(fiy0,ty);
1841	fiz0 = _mm_add_ps(fiz0,tz);
1842
1843	fjx1 = _mm_add_ps(fjx1,tx);
1844	fjy1 = _mm_add_ps(fjy1,ty);
1845	fjz1 = _mm_add_ps(fjz1,tz);
1846
1847	}
1848
1849	/**************************
1850	* CALCULATE INTERACTIONS *
1851	**************************/
1852
1853	if (gmx_mm_any_lt(rsq02,rcutoff2))
1854	{
1855
1856	r02 = _mm_mul_ps(rsq02,rinv02);
1857
1858	/* EWALD ELECTROSTATICS */
1859
1860	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1861	ewrt = _mm_mul_ps(r02,ewtabscale);
1862	ewitab = _mm_cvttps_epi32(ewrt);
1863	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1864	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1865	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1866	&ewtabF,&ewtabFn);
1867	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1868	felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1869
1870	cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1871
1872	fscal = felec;
1873
1874	fscal = _mm_and_ps(fscal,cutoff_mask);
1875
1876	/* Calculate temporary vectorial force */
1877	tx = _mm_mul_ps(fscal,dx02);
1878	ty = _mm_mul_ps(fscal,dy02);
1879	tz = _mm_mul_ps(fscal,dz02);
1880
1881	/* Update vectorial force */
1882	fix0 = _mm_add_ps(fix0,tx);
1883	fiy0 = _mm_add_ps(fiy0,ty);
1884	fiz0 = _mm_add_ps(fiz0,tz);
1885
1886	fjx2 = _mm_add_ps(fjx2,tx);
1887	fjy2 = _mm_add_ps(fjy2,ty);
1888	fjz2 = _mm_add_ps(fjz2,tz);
1889
1890	}
1891
1892	/**************************
1893	* CALCULATE INTERACTIONS *
1894	**************************/
1895
1896	if (gmx_mm_any_lt(rsq10,rcutoff2))
1897	{
1898
1899	r10 = _mm_mul_ps(rsq10,rinv10);
1900
1901	/* EWALD ELECTROSTATICS */
1902
1903	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1904	ewrt = _mm_mul_ps(r10,ewtabscale);
1905	ewitab = _mm_cvttps_epi32(ewrt);
1906	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1907	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1908	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1909	&ewtabF,&ewtabFn);
1910	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1911	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1912
1913	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1914
1915	fscal = felec;
1916
1917	fscal = _mm_and_ps(fscal,cutoff_mask);
1918
1919	/* Calculate temporary vectorial force */
1920	tx = _mm_mul_ps(fscal,dx10);
1921	ty = _mm_mul_ps(fscal,dy10);
1922	tz = _mm_mul_ps(fscal,dz10);
1923
1924	/* Update vectorial force */
1925	fix1 = _mm_add_ps(fix1,tx);
1926	fiy1 = _mm_add_ps(fiy1,ty);
1927	fiz1 = _mm_add_ps(fiz1,tz);
1928
1929	fjx0 = _mm_add_ps(fjx0,tx);
1930	fjy0 = _mm_add_ps(fjy0,ty);
1931	fjz0 = _mm_add_ps(fjz0,tz);
1932
1933	}
1934
1935	/**************************
1936	* CALCULATE INTERACTIONS *
1937	**************************/
1938
1939	if (gmx_mm_any_lt(rsq11,rcutoff2))
1940	{
1941
1942	r11 = _mm_mul_ps(rsq11,rinv11);
1943
1944	/* EWALD ELECTROSTATICS */
1945
1946	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1947	ewrt = _mm_mul_ps(r11,ewtabscale);
1948	ewitab = _mm_cvttps_epi32(ewrt);
1949	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1950	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1951	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1952	&ewtabF,&ewtabFn);
1953	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1954	felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1955
1956	cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1957
1958	fscal = felec;
1959
1960	fscal = _mm_and_ps(fscal,cutoff_mask);
1961
1962	/* Calculate temporary vectorial force */
1963	tx = _mm_mul_ps(fscal,dx11);
1964	ty = _mm_mul_ps(fscal,dy11);
1965	tz = _mm_mul_ps(fscal,dz11);
1966
1967	/* Update vectorial force */
1968	fix1 = _mm_add_ps(fix1,tx);
1969	fiy1 = _mm_add_ps(fiy1,ty);
1970	fiz1 = _mm_add_ps(fiz1,tz);
1971
1972	fjx1 = _mm_add_ps(fjx1,tx);
1973	fjy1 = _mm_add_ps(fjy1,ty);
1974	fjz1 = _mm_add_ps(fjz1,tz);
1975
1976	}
1977
1978	/**************************
1979	* CALCULATE INTERACTIONS *
1980	**************************/
1981
1982	if (gmx_mm_any_lt(rsq12,rcutoff2))
1983	{
1984
1985	r12 = _mm_mul_ps(rsq12,rinv12);
1986
1987	/* EWALD ELECTROSTATICS */
1988
1989	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1990	ewrt = _mm_mul_ps(r12,ewtabscale);
1991	ewitab = _mm_cvttps_epi32(ewrt);
1992	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
1993	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
1994	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
1995	&ewtabF,&ewtabFn);
1996	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1997	felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1998
1999	cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2000
2001	fscal = felec;
2002
2003	fscal = _mm_and_ps(fscal,cutoff_mask);
2004
2005	/* Calculate temporary vectorial force */
2006	tx = _mm_mul_ps(fscal,dx12);
2007	ty = _mm_mul_ps(fscal,dy12);
2008	tz = _mm_mul_ps(fscal,dz12);
2009
2010	/* Update vectorial force */
2011	fix1 = _mm_add_ps(fix1,tx);
2012	fiy1 = _mm_add_ps(fiy1,ty);
2013	fiz1 = _mm_add_ps(fiz1,tz);
2014
2015	fjx2 = _mm_add_ps(fjx2,tx);
2016	fjy2 = _mm_add_ps(fjy2,ty);
2017	fjz2 = _mm_add_ps(fjz2,tz);
2018
2019	}
2020
2021	/**************************
2022	* CALCULATE INTERACTIONS *
2023	**************************/
2024
2025	if (gmx_mm_any_lt(rsq20,rcutoff2))
2026	{
2027
2028	r20 = _mm_mul_ps(rsq20,rinv20);
2029
2030	/* EWALD ELECTROSTATICS */
2031
2032	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2033	ewrt = _mm_mul_ps(r20,ewtabscale);
2034	ewitab = _mm_cvttps_epi32(ewrt);
2035	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2036	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2037	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2038	&ewtabF,&ewtabFn);
2039	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2040	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2041
2042	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2043
2044	fscal = felec;
2045
2046	fscal = _mm_and_ps(fscal,cutoff_mask);
2047
2048	/* Calculate temporary vectorial force */
2049	tx = _mm_mul_ps(fscal,dx20);
2050	ty = _mm_mul_ps(fscal,dy20);
2051	tz = _mm_mul_ps(fscal,dz20);
2052
2053	/* Update vectorial force */
2054	fix2 = _mm_add_ps(fix2,tx);
2055	fiy2 = _mm_add_ps(fiy2,ty);
2056	fiz2 = _mm_add_ps(fiz2,tz);
2057
2058	fjx0 = _mm_add_ps(fjx0,tx);
2059	fjy0 = _mm_add_ps(fjy0,ty);
2060	fjz0 = _mm_add_ps(fjz0,tz);
2061
2062	}
2063
2064	/**************************
2065	* CALCULATE INTERACTIONS *
2066	**************************/
2067
2068	if (gmx_mm_any_lt(rsq21,rcutoff2))
2069	{
2070
2071	r21 = _mm_mul_ps(rsq21,rinv21);
2072
2073	/* EWALD ELECTROSTATICS */
2074
2075	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2076	ewrt = _mm_mul_ps(r21,ewtabscale);
2077	ewitab = _mm_cvttps_epi32(ewrt);
2078	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2079	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2080	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2081	&ewtabF,&ewtabFn);
2082	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2083	felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2084
2085	cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2086
2087	fscal = felec;
2088
2089	fscal = _mm_and_ps(fscal,cutoff_mask);
2090
2091	/* Calculate temporary vectorial force */
2092	tx = _mm_mul_ps(fscal,dx21);
2093	ty = _mm_mul_ps(fscal,dy21);
2094	tz = _mm_mul_ps(fscal,dz21);
2095
2096	/* Update vectorial force */
2097	fix2 = _mm_add_ps(fix2,tx);
2098	fiy2 = _mm_add_ps(fiy2,ty);
2099	fiz2 = _mm_add_ps(fiz2,tz);
2100
2101	fjx1 = _mm_add_ps(fjx1,tx);
2102	fjy1 = _mm_add_ps(fjy1,ty);
2103	fjz1 = _mm_add_ps(fjz1,tz);
2104
2105	}
2106
2107	/**************************
2108	* CALCULATE INTERACTIONS *
2109	**************************/
2110
2111	if (gmx_mm_any_lt(rsq22,rcutoff2))
2112	{
2113
2114	r22 = _mm_mul_ps(rsq22,rinv22);
2115
2116	/* EWALD ELECTROSTATICS */
2117
2118	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2119	ewrt = _mm_mul_ps(r22,ewtabscale);
2120	ewitab = _mm_cvttps_epi32(ewrt);
2121	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2122	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2123	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2124	&ewtabF,&ewtabFn);
2125	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2126	felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2127
2128	cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2129
2130	fscal = felec;
2131
2132	fscal = _mm_and_ps(fscal,cutoff_mask);
2133
2134	/* Calculate temporary vectorial force */
2135	tx = _mm_mul_ps(fscal,dx22);
2136	ty = _mm_mul_ps(fscal,dy22);
2137	tz = _mm_mul_ps(fscal,dz22);
2138
2139	/* Update vectorial force */
2140	fix2 = _mm_add_ps(fix2,tx);
2141	fiy2 = _mm_add_ps(fiy2,ty);
2142	fiz2 = _mm_add_ps(fiz2,tz);
2143
2144	fjx2 = _mm_add_ps(fjx2,tx);
2145	fjy2 = _mm_add_ps(fjy2,ty);
2146	fjz2 = _mm_add_ps(fjz2,tz);
2147
2148	}
2149
2150	fjptrA = f+j_coord_offsetA;
2151	fjptrB = f+j_coord_offsetB;
2152	fjptrC = f+j_coord_offsetC;
2153	fjptrD = f+j_coord_offsetD;
2154
2155	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2156	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2157
2158	/* Inner loop uses 374 flops */
2159	}
2160
2161	if(jidx<j_index_end)
2162	{
2163
2164	/* Get j neighbor index, and coordinate index */
2165	jnrlistA = jjnr[jidx];
2166	jnrlistB = jjnr[jidx+1];
2167	jnrlistC = jjnr[jidx+2];
2168	jnrlistD = jjnr[jidx+3];
2169	/* Sign of each element will be negative for non-real atoms.
2170	* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2171	* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2172	*/
2173	dummy_mask = gmx_mm_castsi128_ps_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2174	jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2175	jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2176	jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2177	jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2178	j_coord_offsetA = DIM3*jnrA;
2179	j_coord_offsetB = DIM3*jnrB;
2180	j_coord_offsetC = DIM3*jnrC;
2181	j_coord_offsetD = DIM3*jnrD;
2182
2183	/* load j atom coordinates */
2184	gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2185	x+j_coord_offsetC,x+j_coord_offsetD,
2186	&jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2187
2188	/* Calculate displacement vector */
2189	dx00 = _mm_sub_ps(ix0,jx0);
2190	dy00 = _mm_sub_ps(iy0,jy0);
2191	dz00 = _mm_sub_ps(iz0,jz0);
2192	dx01 = _mm_sub_ps(ix0,jx1);
2193	dy01 = _mm_sub_ps(iy0,jy1);
2194	dz01 = _mm_sub_ps(iz0,jz1);
2195	dx02 = _mm_sub_ps(ix0,jx2);
2196	dy02 = _mm_sub_ps(iy0,jy2);
2197	dz02 = _mm_sub_ps(iz0,jz2);
2198	dx10 = _mm_sub_ps(ix1,jx0);
2199	dy10 = _mm_sub_ps(iy1,jy0);
2200	dz10 = _mm_sub_ps(iz1,jz0);
2201	dx11 = _mm_sub_ps(ix1,jx1);
2202	dy11 = _mm_sub_ps(iy1,jy1);
2203	dz11 = _mm_sub_ps(iz1,jz1);
2204	dx12 = _mm_sub_ps(ix1,jx2);
2205	dy12 = _mm_sub_ps(iy1,jy2);
2206	dz12 = _mm_sub_ps(iz1,jz2);
2207	dx20 = _mm_sub_ps(ix2,jx0);
2208	dy20 = _mm_sub_ps(iy2,jy0);
2209	dz20 = _mm_sub_ps(iz2,jz0);
2210	dx21 = _mm_sub_ps(ix2,jx1);
2211	dy21 = _mm_sub_ps(iy2,jy1);
2212	dz21 = _mm_sub_ps(iz2,jz1);
2213	dx22 = _mm_sub_ps(ix2,jx2);
2214	dy22 = _mm_sub_ps(iy2,jy2);
2215	dz22 = _mm_sub_ps(iz2,jz2);
2216
2217	/* Calculate squared distance and things based on it */
2218	rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2219	rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2220	rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2221	rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2222	rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2223	rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2224	rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2225	rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2226	rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2227
2228	rinv00 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq00);
2229	rinv01 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq01);
2230	rinv02 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq02);
2231	rinv10 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq10);
2232	rinv11 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq11);
2233	rinv12 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq12);
2234	rinv20 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq20);
2235	rinv21 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq21);
2236	rinv22 = gmx_mm_invsqrt_psgmx_simd_invsqrt_f(rsq22);
2237
2238	rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2239	rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2240	rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2241	rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2242	rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2243	rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2244	rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2245	rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2246	rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2247
2248	fjx0 = _mm_setzero_ps();
2249	fjy0 = _mm_setzero_ps();
2250	fjz0 = _mm_setzero_ps();
2251	fjx1 = _mm_setzero_ps();
2252	fjy1 = _mm_setzero_ps();
2253	fjz1 = _mm_setzero_ps();
2254	fjx2 = _mm_setzero_ps();
2255	fjy2 = _mm_setzero_ps();
2256	fjz2 = _mm_setzero_ps();
2257
2258	/**************************
2259	* CALCULATE INTERACTIONS *
2260	**************************/
2261
2262	if (gmx_mm_any_lt(rsq00,rcutoff2))
2263	{
2264
2265	r00 = _mm_mul_ps(rsq00,rinv00);
2266	r00 = _mm_andnot_ps(dummy_mask,r00);
2267
2268	/* EWALD ELECTROSTATICS */
2269
2270	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2271	ewrt = _mm_mul_ps(r00,ewtabscale);
2272	ewitab = _mm_cvttps_epi32(ewrt);
2273	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2274	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2275	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2276	&ewtabF,&ewtabFn);
2277	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2278	felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2279
2280	/* Analytical LJ-PME */
2281	rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2282	ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
2283	ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
2284	exponent = gmx_simd_exp_rgmx_simd_exp_f(ewcljrsq);
2285	/* poly = exp(-(betar)^2) (1 + (betar)^2 + (betar)^4 /2) */
2286	poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2287	/* f6A = 6 * C6grid * (1 - poly) */
2288	f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2289	/* f6B = C6grid * exponent * beta^6 */
2290	f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2291	/* fvdw = 12C12/r13 - ((6C6 - f6A)/r6 + f6B)/r */
2292	fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2293
2294	cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2295
2296	fscal = _mm_add_ps(felec,fvdw);
2297
2298	fscal = _mm_and_ps(fscal,cutoff_mask);
2299
2300	fscal = _mm_andnot_ps(dummy_mask,fscal);
2301
2302	/* Calculate temporary vectorial force */
2303	tx = _mm_mul_ps(fscal,dx00);
2304	ty = _mm_mul_ps(fscal,dy00);
2305	tz = _mm_mul_ps(fscal,dz00);
2306
2307	/* Update vectorial force */
2308	fix0 = _mm_add_ps(fix0,tx);
2309	fiy0 = _mm_add_ps(fiy0,ty);
2310	fiz0 = _mm_add_ps(fiz0,tz);
2311
2312	fjx0 = _mm_add_ps(fjx0,tx);
2313	fjy0 = _mm_add_ps(fjy0,ty);
2314	fjz0 = _mm_add_ps(fjz0,tz);
2315
2316	}
2317
2318	/**************************
2319	* CALCULATE INTERACTIONS *
2320	**************************/
2321
2322	if (gmx_mm_any_lt(rsq01,rcutoff2))
2323	{
2324
2325	r01 = _mm_mul_ps(rsq01,rinv01);
2326	r01 = _mm_andnot_ps(dummy_mask,r01);
2327
2328	/* EWALD ELECTROSTATICS */
2329
2330	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2331	ewrt = _mm_mul_ps(r01,ewtabscale);
2332	ewitab = _mm_cvttps_epi32(ewrt);
2333	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2334	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2335	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2336	&ewtabF,&ewtabFn);
2337	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2338	felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2339
2340	cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2341
2342	fscal = felec;
2343
2344	fscal = _mm_and_ps(fscal,cutoff_mask);
2345
2346	fscal = _mm_andnot_ps(dummy_mask,fscal);
2347
2348	/* Calculate temporary vectorial force */
2349	tx = _mm_mul_ps(fscal,dx01);
2350	ty = _mm_mul_ps(fscal,dy01);
2351	tz = _mm_mul_ps(fscal,dz01);
2352
2353	/* Update vectorial force */
2354	fix0 = _mm_add_ps(fix0,tx);
2355	fiy0 = _mm_add_ps(fiy0,ty);
2356	fiz0 = _mm_add_ps(fiz0,tz);
2357
2358	fjx1 = _mm_add_ps(fjx1,tx);
2359	fjy1 = _mm_add_ps(fjy1,ty);
2360	fjz1 = _mm_add_ps(fjz1,tz);
2361
2362	}
2363
2364	/**************************
2365	* CALCULATE INTERACTIONS *
2366	**************************/
2367
2368	if (gmx_mm_any_lt(rsq02,rcutoff2))
2369	{
2370
2371	r02 = _mm_mul_ps(rsq02,rinv02);
2372	r02 = _mm_andnot_ps(dummy_mask,r02);
2373
2374	/* EWALD ELECTROSTATICS */
2375
2376	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2377	ewrt = _mm_mul_ps(r02,ewtabscale);
2378	ewitab = _mm_cvttps_epi32(ewrt);
2379	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2380	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2381	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2382	&ewtabF,&ewtabFn);
2383	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2384	felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2385
2386	cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2387
2388	fscal = felec;
2389
2390	fscal = _mm_and_ps(fscal,cutoff_mask);
2391
2392	fscal = _mm_andnot_ps(dummy_mask,fscal);
2393
2394	/* Calculate temporary vectorial force */
2395	tx = _mm_mul_ps(fscal,dx02);
2396	ty = _mm_mul_ps(fscal,dy02);
2397	tz = _mm_mul_ps(fscal,dz02);
2398
2399	/* Update vectorial force */
2400	fix0 = _mm_add_ps(fix0,tx);
2401	fiy0 = _mm_add_ps(fiy0,ty);
2402	fiz0 = _mm_add_ps(fiz0,tz);
2403
2404	fjx2 = _mm_add_ps(fjx2,tx);
2405	fjy2 = _mm_add_ps(fjy2,ty);
2406	fjz2 = _mm_add_ps(fjz2,tz);
2407
2408	}
2409
2410	/**************************
2411	* CALCULATE INTERACTIONS *
2412	**************************/
2413
2414	if (gmx_mm_any_lt(rsq10,rcutoff2))
2415	{
2416
2417	r10 = _mm_mul_ps(rsq10,rinv10);
2418	r10 = _mm_andnot_ps(dummy_mask,r10);
2419
2420	/* EWALD ELECTROSTATICS */
2421
2422	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2423	ewrt = _mm_mul_ps(r10,ewtabscale);
2424	ewitab = _mm_cvttps_epi32(ewrt);
2425	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2426	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2427	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2428	&ewtabF,&ewtabFn);
2429	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2430	felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2431
2432	cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2433
2434	fscal = felec;
2435
2436	fscal = _mm_and_ps(fscal,cutoff_mask);
2437
2438	fscal = _mm_andnot_ps(dummy_mask,fscal);
2439
2440	/* Calculate temporary vectorial force */
2441	tx = _mm_mul_ps(fscal,dx10);
2442	ty = _mm_mul_ps(fscal,dy10);
2443	tz = _mm_mul_ps(fscal,dz10);
2444
2445	/* Update vectorial force */
2446	fix1 = _mm_add_ps(fix1,tx);
2447	fiy1 = _mm_add_ps(fiy1,ty);
2448	fiz1 = _mm_add_ps(fiz1,tz);
2449
2450	fjx0 = _mm_add_ps(fjx0,tx);
2451	fjy0 = _mm_add_ps(fjy0,ty);
2452	fjz0 = _mm_add_ps(fjz0,tz);
2453
2454	}
2455
2456	/**************************
2457	* CALCULATE INTERACTIONS *
2458	**************************/
2459
2460	if (gmx_mm_any_lt(rsq11,rcutoff2))
2461	{
2462
2463	r11 = _mm_mul_ps(rsq11,rinv11);
2464	r11 = _mm_andnot_ps(dummy_mask,r11);
2465
2466	/* EWALD ELECTROSTATICS */
2467
2468	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2469	ewrt = _mm_mul_ps(r11,ewtabscale);
2470	ewitab = _mm_cvttps_epi32(ewrt);
2471	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2472	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2473	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2474	&ewtabF,&ewtabFn);
2475	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2476	felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2477
2478	cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2479
2480	fscal = felec;
2481
2482	fscal = _mm_and_ps(fscal,cutoff_mask);
2483
2484	fscal = _mm_andnot_ps(dummy_mask,fscal);
2485
2486	/* Calculate temporary vectorial force */
2487	tx = _mm_mul_ps(fscal,dx11);
2488	ty = _mm_mul_ps(fscal,dy11);
2489	tz = _mm_mul_ps(fscal,dz11);
2490
2491	/* Update vectorial force */
2492	fix1 = _mm_add_ps(fix1,tx);
2493	fiy1 = _mm_add_ps(fiy1,ty);
2494	fiz1 = _mm_add_ps(fiz1,tz);
2495
2496	fjx1 = _mm_add_ps(fjx1,tx);
2497	fjy1 = _mm_add_ps(fjy1,ty);
2498	fjz1 = _mm_add_ps(fjz1,tz);
2499
2500	}
2501
2502	/**************************
2503	* CALCULATE INTERACTIONS *
2504	**************************/
2505
2506	if (gmx_mm_any_lt(rsq12,rcutoff2))
2507	{
2508
2509	r12 = _mm_mul_ps(rsq12,rinv12);
2510	r12 = _mm_andnot_ps(dummy_mask,r12);
2511
2512	/* EWALD ELECTROSTATICS */
2513
2514	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2515	ewrt = _mm_mul_ps(r12,ewtabscale);
2516	ewitab = _mm_cvttps_epi32(ewrt);
2517	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2518	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2519	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2520	&ewtabF,&ewtabFn);
2521	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2522	felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2523
2524	cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2525
2526	fscal = felec;
2527
2528	fscal = _mm_and_ps(fscal,cutoff_mask);
2529
2530	fscal = _mm_andnot_ps(dummy_mask,fscal);
2531
2532	/* Calculate temporary vectorial force */
2533	tx = _mm_mul_ps(fscal,dx12);
2534	ty = _mm_mul_ps(fscal,dy12);
2535	tz = _mm_mul_ps(fscal,dz12);
2536
2537	/* Update vectorial force */
2538	fix1 = _mm_add_ps(fix1,tx);
2539	fiy1 = _mm_add_ps(fiy1,ty);
2540	fiz1 = _mm_add_ps(fiz1,tz);
2541
2542	fjx2 = _mm_add_ps(fjx2,tx);
2543	fjy2 = _mm_add_ps(fjy2,ty);
2544	fjz2 = _mm_add_ps(fjz2,tz);
2545
2546	}
2547
2548	/**************************
2549	* CALCULATE INTERACTIONS *
2550	**************************/
2551
2552	if (gmx_mm_any_lt(rsq20,rcutoff2))
2553	{
2554
2555	r20 = _mm_mul_ps(rsq20,rinv20);
2556	r20 = _mm_andnot_ps(dummy_mask,r20);
2557
2558	/* EWALD ELECTROSTATICS */
2559
2560	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2561	ewrt = _mm_mul_ps(r20,ewtabscale);
2562	ewitab = _mm_cvttps_epi32(ewrt);
2563	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2564	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2565	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2566	&ewtabF,&ewtabFn);
2567	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2568	felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2569
2570	cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2571
2572	fscal = felec;
2573
2574	fscal = _mm_and_ps(fscal,cutoff_mask);
2575
2576	fscal = _mm_andnot_ps(dummy_mask,fscal);
2577
2578	/* Calculate temporary vectorial force */
2579	tx = _mm_mul_ps(fscal,dx20);
2580	ty = _mm_mul_ps(fscal,dy20);
2581	tz = _mm_mul_ps(fscal,dz20);
2582
2583	/* Update vectorial force */
2584	fix2 = _mm_add_ps(fix2,tx);
2585	fiy2 = _mm_add_ps(fiy2,ty);
2586	fiz2 = _mm_add_ps(fiz2,tz);
2587
2588	fjx0 = _mm_add_ps(fjx0,tx);
2589	fjy0 = _mm_add_ps(fjy0,ty);
2590	fjz0 = _mm_add_ps(fjz0,tz);
2591
2592	}
2593
2594	/**************************
2595	* CALCULATE INTERACTIONS *
2596	**************************/
2597
2598	if (gmx_mm_any_lt(rsq21,rcutoff2))
2599	{
2600
2601	r21 = _mm_mul_ps(rsq21,rinv21);
2602	r21 = _mm_andnot_ps(dummy_mask,r21);
2603
2604	/* EWALD ELECTROSTATICS */
2605
2606	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2607	ewrt = _mm_mul_ps(r21,ewtabscale);
2608	ewitab = _mm_cvttps_epi32(ewrt);
2609	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2610	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2611	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2612	&ewtabF,&ewtabFn);
2613	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2614	felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2615
2616	cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2617
2618	fscal = felec;
2619
2620	fscal = _mm_and_ps(fscal,cutoff_mask);
2621
2622	fscal = _mm_andnot_ps(dummy_mask,fscal);
2623
2624	/* Calculate temporary vectorial force */
2625	tx = _mm_mul_ps(fscal,dx21);
2626	ty = _mm_mul_ps(fscal,dy21);
2627	tz = _mm_mul_ps(fscal,dz21);
2628
2629	/* Update vectorial force */
2630	fix2 = _mm_add_ps(fix2,tx);
2631	fiy2 = _mm_add_ps(fiy2,ty);
2632	fiz2 = _mm_add_ps(fiz2,tz);
2633
2634	fjx1 = _mm_add_ps(fjx1,tx);
2635	fjy1 = _mm_add_ps(fjy1,ty);
2636	fjz1 = _mm_add_ps(fjz1,tz);
2637
2638	}
2639
2640	/**************************
2641	* CALCULATE INTERACTIONS *
2642	**************************/
2643
2644	if (gmx_mm_any_lt(rsq22,rcutoff2))
2645	{
2646
2647	r22 = _mm_mul_ps(rsq22,rinv22);
2648	r22 = _mm_andnot_ps(dummy_mask,r22);
2649
2650	/* EWALD ELECTROSTATICS */
2651
2652	/* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2653	ewrt = _mm_mul_ps(r22,ewtabscale);
2654	ewitab = _mm_cvttps_epi32(ewrt);
2655	eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR)__extension__ ({ __m128 __X = (ewrt); (__m128) __builtin_ia32_roundps ((__v4sf)__X, ((0x00 \| 0x01))); }));
2656	gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(0) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,1)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(1) & 3];})),
2657	ewtab + gmx_mm_extract_epi32(ewitab,2)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(2) & 3];})),ewtab + gmx_mm_extract_epi32(ewitab,3)(__extension__ ({ __v4si __a = (__v4si)(ewitab); __a[(3) & 3];})),
2658	&ewtabF,&ewtabFn);
2659	felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2660	felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2661
2662	cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2663
2664	fscal = felec;
2665
2666	fscal = _mm_and_ps(fscal,cutoff_mask);
2667
2668	fscal = _mm_andnot_ps(dummy_mask,fscal);
2669
2670	/* Calculate temporary vectorial force */
2671	tx = _mm_mul_ps(fscal,dx22);
2672	ty = _mm_mul_ps(fscal,dy22);
2673	tz = _mm_mul_ps(fscal,dz22);
2674
2675	/* Update vectorial force */
2676	fix2 = _mm_add_ps(fix2,tx);
2677	fiy2 = _mm_add_ps(fiy2,ty);
2678	fiz2 = _mm_add_ps(fiz2,tz);
2679
2680	fjx2 = _mm_add_ps(fjx2,tx);
2681	fjy2 = _mm_add_ps(fjy2,ty);
2682	fjz2 = _mm_add_ps(fjz2,tz);
2683
2684	}
2685
2686	fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2687	fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2688	fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2689	fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2690
2691	gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2692	fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2693
2694	/* Inner loop uses 383 flops */
2695	}
2696
2697	/* End of innermost loop */
2698
2699	gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2700	f+i_coord_offset,fshift+i_shift_offset);
2701
2702	/* Increment number of inner iterations */
2703	inneriter += j_index_end - j_index_start;
2704
2705	/* Outer loop uses 18 flops */
2706	}
2707
2708	/* Increment number of outer iterations */
2709	outeriter += nri;
2710
2711	/* Update outer/inner flops */
2712
2713	inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter18 + inneriter383)(nrnb)->n[eNR_NBKERNEL_ELEC_VDW_W3W3_F] += outeriter18 + inneriter 383;
2714	}